diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.559865092748735, + "eval_steps": 100, + "global_step": 2700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 212.203125, + "epoch": 0.0016863406408094434, + "grad_norm": 4.541860923771718, + "kl": 0.0, + "learning_rate": 9.99662731871838e-07, + "loss": 0.0, + "reward": 1.3358332514762878, + "reward_std": 0.7925846576690674, + "rewards/final_reward": 0.08555527292116892, + "rewards/mask_iou_reward": 0.04277763646058446, + "rewards/sam_format_reward": 0.75, + "rewards/sam_reward_func_ultra": 0.19520824775099754, + "rewards/thk_ans_format_reward": 0.390625, + "step": 1, + "think_completion_length": 54.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.484375, + "epoch": 0.003372681281618887, + "grad_norm": 10.409662000899418, + "kl": 0.000614166259765625, + "learning_rate": 9.993254637436761e-07, + "loss": 0.0, + "reward": 1.3770169019699097, + "reward_std": 0.7793349027633667, + "rewards/final_reward": 0.17952230510610304, + "rewards/mask_iou_reward": 0.08976115255305152, + "rewards/sam_format_reward": 0.828125, + "rewards/sam_reward_func_ultra": 0.17389196157455444, + "rewards/thk_ans_format_reward": 0.375, + "step": 2, + "think_completion_length": 61.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.328125, + "epoch": 0.00505902192242833, + "grad_norm": 7.696018374346988, + "kl": 0.0007190704345703125, + "learning_rate": 9.989881956155142e-07, + "loss": 0.0, + "reward": 1.3987104892730713, + "reward_std": 0.7126790881156921, + "rewards/final_reward": 0.13702087713408737, + "rewards/mask_iou_reward": 0.06851043856704368, + "rewards/sam_format_reward": 0.796875, + "rewards/sam_reward_func_ultra": 0.08621050044894218, + "rewards/thk_ans_format_reward": 0.515625, + "step": 3, + "think_completion_length": 90.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.375, + "epoch": 0.006745362563237774, + "grad_norm": 7.557573164502257, + "kl": 0.00098419189453125, + "learning_rate": 9.986509274873523e-07, + "loss": 0.0, + "reward": 1.5465224385261536, + "reward_std": 0.8511916399002075, + "rewards/final_reward": 0.41226695348936837, + "rewards/mask_iou_reward": 0.20613347674468419, + "rewards/sam_format_reward": 0.8125, + "rewards/sam_reward_func_ultra": 0.3121473789215088, + "rewards/thk_ans_format_reward": 0.421875, + "step": 4, + "think_completion_length": 82.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.0, + "epoch": 0.008431703204047217, + "grad_norm": 7.6366763281725065, + "kl": 0.001277923583984375, + "learning_rate": 9.983136593591906e-07, + "loss": 0.0, + "reward": 2.1739466190338135, + "reward_std": 1.1614585518836975, + "rewards/final_reward": 0.6282551614601821, + "rewards/mask_iou_reward": 0.31412758073009106, + "rewards/sam_format_reward": 0.78125, + "rewards/sam_reward_func_ultra": 0.7520716190338135, + "rewards/thk_ans_format_reward": 0.640625, + "step": 5, + "think_completion_length": 100.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.140625, + "epoch": 0.01011804384485666, + "grad_norm": 2.912091121635317, + "kl": 0.00293731689453125, + "learning_rate": 9.979763912310287e-07, + "loss": 0.0, + "reward": 1.8722057342529297, + "reward_std": 0.7133974432945251, + "rewards/final_reward": 0.36722210535603916, + "rewards/mask_iou_reward": 0.18361105267801958, + "rewards/sam_format_reward": 0.84375, + "rewards/sam_reward_func_ultra": 0.2784557491540909, + "rewards/thk_ans_format_reward": 0.75, + "step": 6, + "think_completion_length": 110.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.640625, + "epoch": 0.011804384485666104, + "grad_norm": 5.0167749766290965, + "kl": 0.0034637451171875, + "learning_rate": 9.976391231028668e-07, + "loss": 0.0, + "reward": 2.0893322825431824, + "reward_std": 0.7956610918045044, + "rewards/final_reward": 0.23091329445268022, + "rewards/mask_iou_reward": 0.11545664722634011, + "rewards/sam_format_reward": 0.859375, + "rewards/sam_reward_func_ultra": 0.40183228999376297, + "rewards/thk_ans_format_reward": 0.828125, + "step": 7, + "think_completion_length": 92.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.109375, + "epoch": 0.013490725126475547, + "grad_norm": 4.904823389255345, + "kl": 0.0058135986328125, + "learning_rate": 9.973018549747049e-07, + "loss": 0.0, + "reward": 1.9315199851989746, + "reward_std": 0.7769245803356171, + "rewards/final_reward": 0.32775235227855815, + "rewards/mask_iou_reward": 0.16387617613927907, + "rewards/sam_format_reward": 0.890625, + "rewards/sam_reward_func_ultra": 0.3065200597047806, + "rewards/thk_ans_format_reward": 0.734375, + "step": 8, + "think_completion_length": 100.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.265625, + "epoch": 0.01517706576728499, + "grad_norm": 7.32651281879669, + "kl": 0.00531005859375, + "learning_rate": 9.96964586846543e-07, + "loss": 0.0, + "reward": 2.4003329277038574, + "reward_std": 0.7638080418109894, + "rewards/final_reward": 0.6744501600850431, + "rewards/mask_iou_reward": 0.33722508004252155, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 0.6659578680992126, + "rewards/thk_ans_format_reward": 0.796875, + "step": 9, + "think_completion_length": 95.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.0625, + "epoch": 0.016863406408094434, + "grad_norm": 2.8369350027224343, + "kl": 0.0061187744140625, + "learning_rate": 9.96627318718381e-07, + "loss": 0.0, + "reward": 2.134338617324829, + "reward_std": 0.6036363840103149, + "rewards/final_reward": 0.19468699022801095, + "rewards/mask_iou_reward": 0.09734349511400547, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 0.3218386247754097, + "rewards/thk_ans_format_reward": 0.90625, + "step": 10, + "think_completion_length": 138.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.484375, + "epoch": 0.01854974704890388, + "grad_norm": 4.726322830555546, + "kl": 0.00331878662109375, + "learning_rate": 9.962900505902191e-07, + "loss": 0.0, + "reward": 2.158900499343872, + "reward_std": 0.5658632814884186, + "rewards/final_reward": 0.3395835136590783, + "rewards/mask_iou_reward": 0.16979175682953915, + "rewards/sam_format_reward": 0.921875, + "rewards/sam_reward_func_ultra": 0.29952552914619446, + "rewards/thk_ans_format_reward": 0.9375, + "step": 11, + "think_completion_length": 138.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.8125, + "epoch": 0.02023608768971332, + "grad_norm": 5.799680733497899, + "kl": 0.005096435546875, + "learning_rate": 9.959527824620572e-07, + "loss": 0.0, + "reward": 2.2421607971191406, + "reward_std": 0.545092910528183, + "rewards/final_reward": 0.28276694921239226, + "rewards/mask_iou_reward": 0.14138347460619613, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 0.320285826921463, + "rewards/thk_ans_format_reward": 0.984375, + "step": 12, + "think_completion_length": 121.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.671875, + "epoch": 0.021922428330522766, + "grad_norm": 10.411892938228622, + "kl": 0.0047149658203125, + "learning_rate": 9.956155143338955e-07, + "loss": 0.0, + "reward": 2.1514652967453003, + "reward_std": 0.46626925468444824, + "rewards/final_reward": 0.10493260642775627, + "rewards/mask_iou_reward": 0.052466303213878136, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.2452152743935585, + "rewards/thk_ans_format_reward": 0.9375, + "step": 13, + "think_completion_length": 139.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.578125, + "epoch": 0.023608768971332208, + "grad_norm": 3.988672789225242, + "kl": 0.0039215087890625, + "learning_rate": 9.952782462057336e-07, + "loss": 0.0, + "reward": 2.428071618080139, + "reward_std": 0.4563491642475128, + "rewards/final_reward": 0.6773734275020717, + "rewards/mask_iou_reward": 0.3386867137510359, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4749467372894287, + "rewards/thk_ans_format_reward": 0.953125, + "step": 14, + "think_completion_length": 119.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.109375, + "epoch": 0.025295109612141653, + "grad_norm": 4.0623051435163635, + "kl": 0.0120391845703125, + "learning_rate": 9.949409780775717e-07, + "loss": 0.0, + "reward": 2.3204623460769653, + "reward_std": 0.46338681876659393, + "rewards/final_reward": 0.25837267105119793, + "rewards/mask_iou_reward": 0.12918633552559897, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.36733730882406235, + "rewards/thk_ans_format_reward": 0.96875, + "step": 15, + "think_completion_length": 110.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.546875, + "epoch": 0.026981450252951095, + "grad_norm": 4.023141614046889, + "kl": 0.0052947998046875, + "learning_rate": 9.946037099494098e-07, + "loss": 0.0, + "reward": 2.421836733818054, + "reward_std": 0.47443249821662903, + "rewards/final_reward": 0.7746925003730644, + "rewards/mask_iou_reward": 0.3873462501865322, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.48433683812618256, + "rewards/thk_ans_format_reward": 0.953125, + "step": 16, + "think_completion_length": 89.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.015625, + "epoch": 0.02866779089376054, + "grad_norm": 3.8148204524014933, + "kl": 0.00437164306640625, + "learning_rate": 9.942664418212479e-07, + "loss": 0.0, + "reward": 2.22139310836792, + "reward_std": 0.3197246938943863, + "rewards/final_reward": 0.280598368183643, + "rewards/mask_iou_reward": 0.1402991840918215, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.2526431120932102, + "rewards/thk_ans_format_reward": 1.0, + "step": 17, + "think_completion_length": 102.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.4375, + "epoch": 0.03035413153456998, + "grad_norm": 6.159701968434453, + "kl": 0.0063934326171875, + "learning_rate": 9.93929173693086e-07, + "loss": 0.0, + "reward": 2.3376920223236084, + "reward_std": 0.4732118546962738, + "rewards/final_reward": 0.41400893951175743, + "rewards/mask_iou_reward": 0.20700446975587872, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.35331688821315765, + "rewards/thk_ans_format_reward": 1.0, + "step": 18, + "think_completion_length": 126.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.28125, + "epoch": 0.03204047217537943, + "grad_norm": 7.068604665156501, + "kl": 0.00653076171875, + "learning_rate": 9.93591905564924e-07, + "loss": 0.0, + "reward": 2.571447491645813, + "reward_std": 0.39502865076065063, + "rewards/final_reward": 0.11502420152389195, + "rewards/mask_iou_reward": 0.057512100761945975, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.587072491645813, + "rewards/thk_ans_format_reward": 1.0, + "step": 19, + "think_completion_length": 120.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.09375, + "epoch": 0.03372681281618887, + "grad_norm": 6.134636624813015, + "kl": 0.0063323974609375, + "learning_rate": 9.932546374367621e-07, + "loss": 0.0, + "reward": 2.1835756301879883, + "reward_std": 0.2918053865432739, + "rewards/final_reward": 0.34969948763247793, + "rewards/mask_iou_reward": 0.17484974381623897, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.23045063391327858, + "rewards/thk_ans_format_reward": 0.96875, + "step": 20, + "think_completion_length": 106.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.171875, + "epoch": 0.03541315345699832, + "grad_norm": 5.003282709985643, + "kl": 0.0080413818359375, + "learning_rate": 9.929173693086002e-07, + "loss": 0.0, + "reward": 2.1113470792770386, + "reward_std": 0.3219098150730133, + "rewards/final_reward": 0.2392562255838127, + "rewards/mask_iou_reward": 0.11962811279190635, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 0.1582220196723938, + "rewards/thk_ans_format_reward": 1.0, + "step": 21, + "think_completion_length": 104.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.703125, + "epoch": 0.03709949409780776, + "grad_norm": 7.420447224924629, + "kl": 0.007659912109375, + "learning_rate": 9.925801011804385e-07, + "loss": 0.0, + "reward": 2.3550353050231934, + "reward_std": 0.47632284462451935, + "rewards/final_reward": 0.7523633924279491, + "rewards/mask_iou_reward": 0.37618169621397457, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 0.41753529757261276, + "rewards/thk_ans_format_reward": 0.984375, + "step": 22, + "think_completion_length": 111.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.515625, + "epoch": 0.0387858347386172, + "grad_norm": 6.934608630709545, + "kl": 0.009063720703125, + "learning_rate": 9.922428330522766e-07, + "loss": 0.0, + "reward": 2.567542314529419, + "reward_std": 0.31761983036994934, + "rewards/final_reward": 0.7383073887884706, + "rewards/mask_iou_reward": 0.3691536943942353, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.5831672549247742, + "rewards/thk_ans_format_reward": 1.0, + "step": 23, + "think_completion_length": 100.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.3125, + "epoch": 0.04047217537942664, + "grad_norm": 9.343990713381167, + "kl": 0.009674072265625, + "learning_rate": 9.919055649241147e-07, + "loss": 0.0, + "reward": 2.570692539215088, + "reward_std": 0.40113507211208344, + "rewards/final_reward": 0.06680689264378081, + "rewards/mask_iou_reward": 0.033403446321890407, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.5863174498081207, + "rewards/thk_ans_format_reward": 1.0, + "step": 24, + "think_completion_length": 115.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.546875, + "epoch": 0.04215851602023609, + "grad_norm": 11.581955402368543, + "kl": 0.011810302734375, + "learning_rate": 9.915682967959528e-07, + "loss": 0.0, + "reward": 2.5906275510787964, + "reward_std": 0.4756350666284561, + "rewards/final_reward": 0.4344880669538129, + "rewards/mask_iou_reward": 0.21724403347690646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5906275063753128, + "rewards/thk_ans_format_reward": 1.0, + "step": 25, + "think_completion_length": 87.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.734375, + "epoch": 0.04384485666104553, + "grad_norm": 3.7116820186026, + "kl": 0.0115966796875, + "learning_rate": 9.912310286677909e-07, + "loss": 0.0, + "reward": 2.8216378688812256, + "reward_std": 0.6313284933567047, + "rewards/final_reward": 0.9093900982781415, + "rewards/mask_iou_reward": 0.4546950491390708, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8528877198696136, + "rewards/thk_ans_format_reward": 0.984375, + "step": 26, + "think_completion_length": 83.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.421875, + "epoch": 0.045531197301854974, + "grad_norm": 9.778107173702582, + "kl": 0.0108642578125, + "learning_rate": 9.90893760539629e-07, + "loss": 0.0, + "reward": 2.408462643623352, + "reward_std": 0.33236178010702133, + "rewards/final_reward": 0.34480383441485996, + "rewards/mask_iou_reward": 0.17240191720742998, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.4240875542163849, + "rewards/thk_ans_format_reward": 1.0, + "step": 27, + "think_completion_length": 105.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.140625, + "epoch": 0.047217537942664416, + "grad_norm": 3.439777122996586, + "kl": 0.0130615234375, + "learning_rate": 9.90556492411467e-07, + "loss": 0.0, + "reward": 2.4471945762634277, + "reward_std": 0.60556361079216, + "rewards/final_reward": 0.07983588152777385, + "rewards/mask_iou_reward": 0.039917940763886925, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 0.5409445911645889, + "rewards/thk_ans_format_reward": 0.953125, + "step": 28, + "think_completion_length": 82.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.359375, + "epoch": 0.048903878583473864, + "grad_norm": 5.727453114810501, + "kl": 0.01263427734375, + "learning_rate": 9.902192242833051e-07, + "loss": 0.0, + "reward": 2.2731913328170776, + "reward_std": 0.3333955407142639, + "rewards/final_reward": 0.17951628004589953, + "rewards/mask_iou_reward": 0.08975814002294977, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.3044413551688194, + "rewards/thk_ans_format_reward": 0.984375, + "step": 29, + "think_completion_length": 81.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.203125, + "epoch": 0.050590219224283306, + "grad_norm": 3.5572722758780215, + "kl": 0.01251220703125, + "learning_rate": 9.898819561551432e-07, + "loss": 0.0, + "reward": 2.581373453140259, + "reward_std": 0.4498617798089981, + "rewards/final_reward": 0.4765882465393859, + "rewards/mask_iou_reward": 0.23829412326969296, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.5969983041286469, + "rewards/thk_ans_format_reward": 1.0, + "step": 30, + "think_completion_length": 110.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.953125, + "epoch": 0.05227655986509275, + "grad_norm": 6.201949505321076, + "kl": 0.013153076171875, + "learning_rate": 9.895446880269815e-07, + "loss": 0.0, + "reward": 2.519645571708679, + "reward_std": 0.5184344947338104, + "rewards/final_reward": 0.20871607362538738, + "rewards/mask_iou_reward": 0.10435803681269369, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.5508955717086792, + "rewards/thk_ans_format_reward": 0.984375, + "step": 31, + "think_completion_length": 74.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.734375, + "epoch": 0.05396290050590219, + "grad_norm": 3.1357531664931564, + "kl": 0.014801025390625, + "learning_rate": 9.892074198988196e-07, + "loss": 0.0, + "reward": 2.109021544456482, + "reward_std": 0.2812964990735054, + "rewards/final_reward": 0.17978053567590996, + "rewards/mask_iou_reward": 0.08989026783795498, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.12464653328061104, + "rewards/thk_ans_format_reward": 1.0, + "step": 32, + "think_completion_length": 103.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.203125, + "epoch": 0.05564924114671164, + "grad_norm": 7.359110139858053, + "kl": 0.013458251953125, + "learning_rate": 9.888701517706575e-07, + "loss": 0.0, + "reward": 3.050374388694763, + "reward_std": 0.5476076006889343, + "rewards/final_reward": 1.2483584376076524, + "rewards/mask_iou_reward": 0.6241792188038262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.050374448299408, + "rewards/thk_ans_format_reward": 1.0, + "step": 33, + "think_completion_length": 85.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.390625, + "epoch": 0.05733558178752108, + "grad_norm": 3.3492929495520656, + "kl": 0.01727294921875, + "learning_rate": 9.885328836424958e-07, + "loss": 0.0, + "reward": 2.332158923149109, + "reward_std": 0.48627421259880066, + "rewards/final_reward": 0.2444915259100053, + "rewards/mask_iou_reward": 0.12224576295500265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.3477838523685932, + "rewards/thk_ans_format_reward": 0.984375, + "step": 34, + "think_completion_length": 92.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.859375, + "epoch": 0.05902192242833052, + "grad_norm": 5.274814211888088, + "kl": 0.01641845703125, + "learning_rate": 9.881956155143339e-07, + "loss": 0.0, + "reward": 2.788159489631653, + "reward_std": 0.47747406363487244, + "rewards/final_reward": 0.6692964275570692, + "rewards/mask_iou_reward": 0.3346482137785346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7881594300270081, + "rewards/thk_ans_format_reward": 1.0, + "step": 35, + "think_completion_length": 101.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4375, + "epoch": 0.06070826306913996, + "grad_norm": 3.8878235168317232, + "kl": 0.01806640625, + "learning_rate": 9.87858347386172e-07, + "loss": 0.0, + "reward": 2.483630895614624, + "reward_std": 0.35369937121868134, + "rewards/final_reward": 0.6384590901960252, + "rewards/mask_iou_reward": 0.3192295450980126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.49925583600997925, + "rewards/thk_ans_format_reward": 0.984375, + "step": 36, + "think_completion_length": 94.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.515625, + "epoch": 0.06239460370994941, + "grad_norm": 4.710577774961891, + "kl": 0.018310546875, + "learning_rate": 9.8752107925801e-07, + "loss": 0.0, + "reward": 2.351213812828064, + "reward_std": 0.43042635917663574, + "rewards/final_reward": 0.5359964884167394, + "rewards/mask_iou_reward": 0.2679982442083697, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.3980888221412897, + "rewards/thk_ans_format_reward": 0.984375, + "step": 37, + "think_completion_length": 91.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.640625, + "epoch": 0.06408094435075885, + "grad_norm": 6.566714388018845, + "kl": 0.019775390625, + "learning_rate": 9.871838111298481e-07, + "loss": 0.0, + "reward": 2.7930028438568115, + "reward_std": 0.4626040458679199, + "rewards/final_reward": 1.280001967353977, + "rewards/mask_iou_reward": 0.6400009836769885, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7930029332637787, + "rewards/thk_ans_format_reward": 1.0, + "step": 38, + "think_completion_length": 114.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.03125, + "epoch": 0.0657672849915683, + "grad_norm": 4.749445484773544, + "kl": 0.0218505859375, + "learning_rate": 9.868465430016864e-07, + "loss": 0.0, + "reward": 2.3910492658615112, + "reward_std": 0.4347519278526306, + "rewards/final_reward": 0.5976855387780159, + "rewards/mask_iou_reward": 0.29884276938900795, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.40667423605918884, + "rewards/thk_ans_format_reward": 1.0, + "step": 39, + "think_completion_length": 61.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.03125, + "epoch": 0.06745362563237774, + "grad_norm": 6.983534877629888, + "kl": 0.0255126953125, + "learning_rate": 9.865092748735245e-07, + "loss": 0.0, + "reward": 2.32186222076416, + "reward_std": 0.32901330292224884, + "rewards/final_reward": 0.27268833919659746, + "rewards/mask_iou_reward": 0.13634416959829873, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.32186219096183777, + "rewards/thk_ans_format_reward": 1.0, + "step": 40, + "think_completion_length": 74.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0625, + "epoch": 0.06913996627318718, + "grad_norm": 2.6419405467908934, + "kl": 0.0234375, + "learning_rate": 9.861720067453626e-07, + "loss": 0.0, + "reward": 2.150454521179199, + "reward_std": 0.35950616002082825, + "rewards/final_reward": 0.18317714975195007, + "rewards/mask_iou_reward": 0.09158857487597503, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.18170449137687683, + "rewards/thk_ans_format_reward": 0.984375, + "step": 41, + "think_completion_length": 64.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.609375, + "epoch": 0.07082630691399663, + "grad_norm": 4.05208078269746, + "kl": 0.0238037109375, + "learning_rate": 9.858347386172007e-07, + "loss": 0.0, + "reward": 2.6930134296417236, + "reward_std": 0.4153265655040741, + "rewards/final_reward": 0.7187156032058426, + "rewards/mask_iou_reward": 0.3593578016029213, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6930133700370789, + "rewards/thk_ans_format_reward": 1.0, + "step": 42, + "think_completion_length": 87.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.265625, + "epoch": 0.07251264755480608, + "grad_norm": 8.775407716744624, + "kl": 0.0306396484375, + "learning_rate": 9.854974704890388e-07, + "loss": 0.0, + "reward": 2.2557884454727173, + "reward_std": 0.2892530858516693, + "rewards/final_reward": 0.29538722660033423, + "rewards/mask_iou_reward": 0.14769361330016711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.25578849017620087, + "rewards/thk_ans_format_reward": 1.0, + "step": 43, + "think_completion_length": 64.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.234375, + "epoch": 0.07419898819561552, + "grad_norm": 4.092938456973902, + "kl": 0.025390625, + "learning_rate": 9.851602023608769e-07, + "loss": 0.0, + "reward": 2.231912851333618, + "reward_std": 0.25391124188899994, + "rewards/final_reward": 0.1966431877886882, + "rewards/mask_iou_reward": 0.0983215938943441, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.23191292583942413, + "rewards/thk_ans_format_reward": 1.0, + "step": 44, + "think_completion_length": 76.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8125, + "epoch": 0.07588532883642496, + "grad_norm": 3.9861339883345415, + "kl": 0.02789306640625, + "learning_rate": 9.84822934232715e-07, + "loss": 0.0, + "reward": 2.4915008544921875, + "reward_std": 0.3098950535058975, + "rewards/final_reward": 0.48250574585781697, + "rewards/mask_iou_reward": 0.24125287292890849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4915008842945099, + "rewards/thk_ans_format_reward": 1.0, + "step": 45, + "think_completion_length": 81.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.671875, + "epoch": 0.0775716694772344, + "grad_norm": 5.193780871736565, + "kl": 0.02783203125, + "learning_rate": 9.84485666104553e-07, + "loss": 0.0, + "reward": 2.3561939001083374, + "reward_std": 0.48239606618881226, + "rewards/final_reward": 0.3691018382312205, + "rewards/mask_iou_reward": 0.18455091911561025, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.356193870306015, + "rewards/thk_ans_format_reward": 1.0, + "step": 46, + "think_completion_length": 71.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.265625, + "epoch": 0.07925801011804384, + "grad_norm": 15.234317760475385, + "kl": 0.0238037109375, + "learning_rate": 9.841483979763911e-07, + "loss": 0.0, + "reward": 2.8241143226623535, + "reward_std": 0.5089195966720581, + "rewards/final_reward": 0.7781608077357804, + "rewards/mask_iou_reward": 0.3890804038678902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.824114203453064, + "rewards/thk_ans_format_reward": 1.0, + "step": 47, + "think_completion_length": 77.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.671875, + "epoch": 0.08094435075885328, + "grad_norm": 5.321848456796247, + "kl": 0.02398681640625, + "learning_rate": 9.838111298482294e-07, + "loss": 0.0, + "reward": 2.518641948699951, + "reward_std": 0.6300854980945587, + "rewards/final_reward": 0.25648786787986366, + "rewards/mask_iou_reward": 0.12824393393993183, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.549891784787178, + "rewards/thk_ans_format_reward": 0.984375, + "step": 48, + "think_completion_length": 96.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8125, + "epoch": 0.08263069139966273, + "grad_norm": 4.2051402144970185, + "kl": 0.0277099609375, + "learning_rate": 9.834738617200675e-07, + "loss": 0.0, + "reward": 2.3136563301086426, + "reward_std": 0.33684292435646057, + "rewards/final_reward": 0.25782320440538586, + "rewards/mask_iou_reward": 0.12891160220269293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.31365638226270676, + "rewards/thk_ans_format_reward": 1.0, + "step": 49, + "think_completion_length": 80.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.09375, + "epoch": 0.08431703204047218, + "grad_norm": 3.5448275921745855, + "kl": 0.03167724609375, + "learning_rate": 9.831365935919054e-07, + "loss": 0.0, + "reward": 2.624325752258301, + "reward_std": 0.5980704128742218, + "rewards/final_reward": 0.7610812306965711, + "rewards/mask_iou_reward": 0.38054061534828554, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.6712007820606232, + "rewards/thk_ans_format_reward": 0.984375, + "step": 50, + "think_completion_length": 94.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.625, + "epoch": 0.08600337268128162, + "grad_norm": 13.710511715854196, + "kl": 0.02752685546875, + "learning_rate": 9.827993254637437e-07, + "loss": 0.0, + "reward": 2.9260172843933105, + "reward_std": 0.5077286660671234, + "rewards/final_reward": 0.6013949454238717, + "rewards/mask_iou_reward": 0.30069747271193586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.926017165184021, + "rewards/thk_ans_format_reward": 1.0, + "step": 51, + "think_completion_length": 89.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.140625, + "epoch": 0.08768971332209106, + "grad_norm": 5.965861687588688, + "kl": 0.0281982421875, + "learning_rate": 9.824620573355818e-07, + "loss": 0.0, + "reward": 2.553568720817566, + "reward_std": 0.39463698863983154, + "rewards/final_reward": 0.7572741895663272, + "rewards/mask_iou_reward": 0.3786370947831636, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.5848187357187271, + "rewards/thk_ans_format_reward": 1.0, + "step": 52, + "think_completion_length": 80.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.265625, + "epoch": 0.0893760539629005, + "grad_norm": 5.2272156753878685, + "kl": 0.037109375, + "learning_rate": 9.821247892074199e-07, + "loss": 0.0, + "reward": 2.8453149795532227, + "reward_std": 0.3831482380628586, + "rewards/final_reward": 1.032414764875731, + "rewards/mask_iou_reward": 0.5162073824378655, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8609398603439331, + "rewards/thk_ans_format_reward": 1.0, + "step": 53, + "think_completion_length": 82.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.96875, + "epoch": 0.09106239460370995, + "grad_norm": 18.25648493841559, + "kl": 0.041748046875, + "learning_rate": 9.81787521079258e-07, + "loss": 0.0, + "reward": 2.4916036128997803, + "reward_std": 0.5656653642654419, + "rewards/final_reward": 0.4259493768200442, + "rewards/mask_iou_reward": 0.2129746884100221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.49160344898700714, + "rewards/thk_ans_format_reward": 1.0, + "step": 54, + "think_completion_length": 76.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.0, + "epoch": 0.09274873524451939, + "grad_norm": 4.309489924245263, + "kl": 0.0341796875, + "learning_rate": 9.81450252951096e-07, + "loss": 0.0, + "reward": 2.443922281265259, + "reward_std": 0.398771733045578, + "rewards/final_reward": 0.3771699374711056, + "rewards/mask_iou_reward": 0.1885849687355528, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.4595472365617752, + "rewards/thk_ans_format_reward": 1.0, + "step": 55, + "think_completion_length": 96.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.0625, + "epoch": 0.09443507588532883, + "grad_norm": 6.056181466884099, + "kl": 0.047119140625, + "learning_rate": 9.811129848229341e-07, + "loss": 0.0001, + "reward": 2.8348472118377686, + "reward_std": 0.48723451793193817, + "rewards/final_reward": 0.5843060876407499, + "rewards/mask_iou_reward": 0.29215304382037494, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8348471522331238, + "rewards/thk_ans_format_reward": 1.0, + "step": 56, + "think_completion_length": 79.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.890625, + "epoch": 0.09612141652613827, + "grad_norm": 4.78709575873903, + "kl": 0.031494140625, + "learning_rate": 9.807757166947724e-07, + "loss": 0.0, + "reward": 2.881469249725342, + "reward_std": 0.2315191924571991, + "rewards/final_reward": 0.8960268816351212, + "rewards/mask_iou_reward": 0.4480134408175606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.881469264626503, + "rewards/thk_ans_format_reward": 1.0, + "step": 57, + "think_completion_length": 97.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9375, + "epoch": 0.09780775716694773, + "grad_norm": 4.489917300832141, + "kl": 0.041015625, + "learning_rate": 9.804384485666103e-07, + "loss": 0.0, + "reward": 2.621418833732605, + "reward_std": 0.3816119581460953, + "rewards/final_reward": 0.3244678704006397, + "rewards/mask_iou_reward": 0.16223393520031984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6370438188314438, + "rewards/thk_ans_format_reward": 0.984375, + "step": 58, + "think_completion_length": 77.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.421875, + "epoch": 0.09949409780775717, + "grad_norm": 30.73746270981293, + "kl": 0.9716796875, + "learning_rate": 9.801011804384484e-07, + "loss": 0.001, + "reward": 2.356270670890808, + "reward_std": 0.32067833840847015, + "rewards/final_reward": 0.23782315432838275, + "rewards/mask_iou_reward": 0.11891157716419137, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.3875206280499697, + "rewards/thk_ans_format_reward": 0.984375, + "step": 59, + "think_completion_length": 81.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.859375, + "epoch": 0.10118043844856661, + "grad_norm": 7.300470439382993, + "kl": 0.0509033203125, + "learning_rate": 9.797639123102867e-07, + "loss": 0.0001, + "reward": 2.279364824295044, + "reward_std": 0.3495059013366699, + "rewards/final_reward": 0.3413407541577943, + "rewards/mask_iou_reward": 0.17067037707889715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.2793649360537529, + "rewards/thk_ans_format_reward": 1.0, + "step": 60, + "think_completion_length": 82.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.375, + "epoch": 0.10286677908937605, + "grad_norm": 3.6231139230653238, + "kl": 0.048095703125, + "learning_rate": 9.794266441821248e-07, + "loss": 0.0, + "reward": 2.9439969062805176, + "reward_std": 0.44367513060569763, + "rewards/final_reward": 1.0699706439093257, + "rewards/mask_iou_reward": 0.5349853219546629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9439970254898071, + "rewards/thk_ans_format_reward": 1.0, + "step": 61, + "think_completion_length": 74.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.140625, + "epoch": 0.1045531197301855, + "grad_norm": 4.731110310007667, + "kl": 0.041748046875, + "learning_rate": 9.790893760539629e-07, + "loss": 0.0, + "reward": 2.3657991886138916, + "reward_std": 0.3336651027202606, + "rewards/final_reward": 0.23778256411692913, + "rewards/mask_iou_reward": 0.11889128205846457, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.38142427057027817, + "rewards/thk_ans_format_reward": 1.0, + "step": 62, + "think_completion_length": 69.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.734375, + "epoch": 0.10623946037099494, + "grad_norm": 4.332128989884373, + "kl": 0.045166015625, + "learning_rate": 9.78752107925801e-07, + "loss": 0.0, + "reward": 2.584197163581848, + "reward_std": 0.36240070313215256, + "rewards/final_reward": 1.0086378407320715, + "rewards/mask_iou_reward": 0.5043189203660358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5841971933841705, + "rewards/thk_ans_format_reward": 1.0, + "step": 63, + "think_completion_length": 66.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.375, + "epoch": 0.10792580101180438, + "grad_norm": 6.430450000313298, + "kl": 0.0435791015625, + "learning_rate": 9.78414839797639e-07, + "loss": 0.0, + "reward": 2.966284394264221, + "reward_std": 0.44423648715019226, + "rewards/final_reward": 0.9652030991016378, + "rewards/mask_iou_reward": 0.4826015495508189, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9662843346595764, + "rewards/thk_ans_format_reward": 1.0, + "step": 64, + "think_completion_length": 70.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.75, + "epoch": 0.10961214165261383, + "grad_norm": 6.607035457021729, + "kl": 0.0462646484375, + "learning_rate": 9.780775716694773e-07, + "loss": 0.0, + "reward": 2.5945472717285156, + "reward_std": 0.43930642306804657, + "rewards/final_reward": 0.5589171176406154, + "rewards/mask_iou_reward": 0.2794585588203077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5945473164319992, + "rewards/thk_ans_format_reward": 1.0, + "step": 65, + "think_completion_length": 68.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.09375, + "epoch": 0.11129848229342328, + "grad_norm": 7.237969962856227, + "kl": 0.0462646484375, + "learning_rate": 9.777403035413154e-07, + "loss": 0.0, + "reward": 2.5197932720184326, + "reward_std": 0.361030712723732, + "rewards/final_reward": 0.8137916503818017, + "rewards/mask_iou_reward": 0.40689582519090084, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5197932720184326, + "rewards/thk_ans_format_reward": 1.0, + "step": 66, + "think_completion_length": 78.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.984375, + "epoch": 0.11298482293423272, + "grad_norm": 3.667449916530634, + "kl": 0.052734375, + "learning_rate": 9.774030354131533e-07, + "loss": 0.0001, + "reward": 2.8902995586395264, + "reward_std": 0.3441592901945114, + "rewards/final_reward": 0.7054827168834719, + "rewards/mask_iou_reward": 0.35274135844173593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8902994990348816, + "rewards/thk_ans_format_reward": 1.0, + "step": 67, + "think_completion_length": 84.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.078125, + "epoch": 0.11467116357504216, + "grad_norm": 4.636281295895278, + "kl": 0.05419921875, + "learning_rate": 9.770657672849916e-07, + "loss": 0.0001, + "reward": 3.043671727180481, + "reward_std": 0.4341175705194473, + "rewards/final_reward": 0.8562785388668822, + "rewards/mask_iou_reward": 0.4281392694334411, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0436716675758362, + "rewards/thk_ans_format_reward": 1.0, + "step": 68, + "think_completion_length": 63.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.890625, + "epoch": 0.1163575042158516, + "grad_norm": 6.675883431911974, + "kl": 0.05029296875, + "learning_rate": 9.767284991568297e-07, + "loss": 0.0001, + "reward": 2.569810628890991, + "reward_std": 0.43355831503868103, + "rewards/final_reward": 0.9403461534095212, + "rewards/mask_iou_reward": 0.4701730767047606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5698107182979584, + "rewards/thk_ans_format_reward": 1.0, + "step": 69, + "think_completion_length": 81.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.046875, + "epoch": 0.11804384485666104, + "grad_norm": 3.7985544472742885, + "kl": 0.05224609375, + "learning_rate": 9.763912310286678e-07, + "loss": 0.0, + "reward": 2.5393481254577637, + "reward_std": 0.37574321031570435, + "rewards/final_reward": 0.5661758463546982, + "rewards/mask_iou_reward": 0.2830879231773491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5393481552600861, + "rewards/thk_ans_format_reward": 1.0, + "step": 70, + "think_completion_length": 60.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.03125, + "epoch": 0.11973018549747048, + "grad_norm": 9.509296755104442, + "kl": 0.050048828125, + "learning_rate": 9.760539629005059e-07, + "loss": 0.0, + "reward": 2.6844829320907593, + "reward_std": 0.25827430188655853, + "rewards/final_reward": 0.9556732039750488, + "rewards/mask_iou_reward": 0.4778366019875244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6844830363988876, + "rewards/thk_ans_format_reward": 1.0, + "step": 71, + "think_completion_length": 69.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.12141652613827993, + "grad_norm": 6.190314653353028, + "kl": 0.050048828125, + "learning_rate": 9.75716694772344e-07, + "loss": 0.0001, + "reward": 2.965428590774536, + "reward_std": 0.4965183287858963, + "rewards/final_reward": 1.020839803331826, + "rewards/mask_iou_reward": 0.510419901665913, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9654284715652466, + "rewards/thk_ans_format_reward": 1.0, + "step": 72, + "think_completion_length": 73.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.12310286677908938, + "grad_norm": 3.917708434827429, + "kl": 0.055908203125, + "learning_rate": 9.75379426644182e-07, + "loss": 0.0001, + "reward": 2.393458843231201, + "reward_std": 0.44205300509929657, + "rewards/final_reward": 0.330337392985688, + "rewards/mask_iou_reward": 0.165168696492844, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.40908390283584595, + "rewards/thk_ans_format_reward": 1.0, + "step": 73, + "think_completion_length": 73.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.640625, + "epoch": 0.12478920741989882, + "grad_norm": 18.101921577359803, + "kl": 0.0517578125, + "learning_rate": 9.750421585160203e-07, + "loss": 0.0001, + "reward": 2.5843217372894287, + "reward_std": 0.3954938128590584, + "rewards/final_reward": 0.5663095013474037, + "rewards/mask_iou_reward": 0.2831547506737018, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5843217074871063, + "rewards/thk_ans_format_reward": 1.0, + "step": 74, + "think_completion_length": 61.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.75, + "epoch": 0.12647554806070826, + "grad_norm": 4.985492957391431, + "kl": 0.0654296875, + "learning_rate": 9.747048903878582e-07, + "loss": 0.0001, + "reward": 2.6875481605529785, + "reward_std": 0.3493267670273781, + "rewards/final_reward": 0.8727965459469798, + "rewards/mask_iou_reward": 0.4363982729734899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6875482201576233, + "rewards/thk_ans_format_reward": 1.0, + "step": 75, + "think_completion_length": 63.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.203125, + "epoch": 0.1281618887015177, + "grad_norm": 4.3007506207368875, + "kl": 0.0550537109375, + "learning_rate": 9.743676222596963e-07, + "loss": 0.0001, + "reward": 2.5128692388534546, + "reward_std": 0.2817462384700775, + "rewards/final_reward": 0.7190053530698697, + "rewards/mask_iou_reward": 0.35950267653493484, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5128692984580994, + "rewards/thk_ans_format_reward": 1.0, + "step": 76, + "think_completion_length": 65.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.12984822934232715, + "grad_norm": 4.301075928648367, + "kl": 0.0494384765625, + "learning_rate": 9.740303541315346e-07, + "loss": 0.0, + "reward": 2.524822235107422, + "reward_std": 0.5233335793018341, + "rewards/final_reward": 0.18636864728108127, + "rewards/mask_iou_reward": 0.09318432364054063, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.5560723841190338, + "rewards/thk_ans_format_reward": 1.0, + "step": 77, + "think_completion_length": 66.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.28125, + "epoch": 0.1315345699831366, + "grad_norm": 8.000478470083442, + "kl": 0.0528564453125, + "learning_rate": 9.736930860033727e-07, + "loss": 0.0001, + "reward": 2.4821194410324097, + "reward_std": 0.36987268924713135, + "rewards/final_reward": 0.38024887171918265, + "rewards/mask_iou_reward": 0.19012443585959132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4821194261312485, + "rewards/thk_ans_format_reward": 1.0, + "step": 78, + "think_completion_length": 78.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.84375, + "epoch": 0.13322091062394603, + "grad_norm": 13.794375657986603, + "kl": 0.0489501953125, + "learning_rate": 9.733558178752108e-07, + "loss": 0.0, + "reward": 2.5945109128952026, + "reward_std": 0.49958792328834534, + "rewards/final_reward": 0.562274788805415, + "rewards/mask_iou_reward": 0.2811373944027075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5945108532905579, + "rewards/thk_ans_format_reward": 1.0, + "step": 79, + "think_completion_length": 70.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.40625, + "epoch": 0.13490725126475547, + "grad_norm": 9.463013735013696, + "kl": 0.052734375, + "learning_rate": 9.730185497470489e-07, + "loss": 0.0001, + "reward": 3.1495691537857056, + "reward_std": 0.3907051086425781, + "rewards/final_reward": 0.943087603650786, + "rewards/mask_iou_reward": 0.471543801825393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1495689749717712, + "rewards/thk_ans_format_reward": 1.0, + "step": 80, + "think_completion_length": 65.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.375, + "epoch": 0.13659359190556492, + "grad_norm": 7.983156067058834, + "kl": 0.0528564453125, + "learning_rate": 9.72681281618887e-07, + "loss": 0.0001, + "reward": 2.804241895675659, + "reward_std": 0.5239145308732986, + "rewards/final_reward": 0.5999701674591078, + "rewards/mask_iou_reward": 0.2999850837295539, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8042419850826263, + "rewards/thk_ans_format_reward": 1.0, + "step": 81, + "think_completion_length": 70.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.203125, + "epoch": 0.13827993254637436, + "grad_norm": 4.445323229695713, + "kl": 0.074951171875, + "learning_rate": 9.72344013490725e-07, + "loss": 0.0001, + "reward": 3.071893572807312, + "reward_std": 0.44676540791988373, + "rewards/final_reward": 1.342211603129101, + "rewards/mask_iou_reward": 0.6711058015645505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.071893572807312, + "rewards/thk_ans_format_reward": 1.0, + "step": 82, + "think_completion_length": 69.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.46875, + "epoch": 0.1399662731871838, + "grad_norm": 14.40443893510557, + "kl": 0.0650634765625, + "learning_rate": 9.720067453625631e-07, + "loss": 0.0001, + "reward": 2.468700408935547, + "reward_std": 0.41579216718673706, + "rewards/final_reward": 0.4535352431406775, + "rewards/mask_iou_reward": 0.22676762157033875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4687004014849663, + "rewards/thk_ans_format_reward": 1.0, + "step": 83, + "think_completion_length": 79.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.546875, + "epoch": 0.14165261382799327, + "grad_norm": 4.915086814801756, + "kl": 0.060791015625, + "learning_rate": 9.716694772344012e-07, + "loss": 0.0001, + "reward": 2.4039013385772705, + "reward_std": 0.25321827083826065, + "rewards/final_reward": 0.24754253484511543, + "rewards/mask_iou_reward": 0.12377126742255772, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.40390123426914215, + "rewards/thk_ans_format_reward": 1.0, + "step": 84, + "think_completion_length": 64.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.421875, + "epoch": 0.1433389544688027, + "grad_norm": 5.806750656488714, + "kl": 0.060302734375, + "learning_rate": 9.713322091062393e-07, + "loss": 0.0001, + "reward": 3.2514740228652954, + "reward_std": 0.42037880420684814, + "rewards/final_reward": 1.3206563347399518, + "rewards/mask_iou_reward": 0.6603281673699759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2514739036560059, + "rewards/thk_ans_format_reward": 1.0, + "step": 85, + "think_completion_length": 57.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8125, + "epoch": 0.14502529510961215, + "grad_norm": 12.274112515128795, + "kl": 0.0693359375, + "learning_rate": 9.709949409780776e-07, + "loss": 0.0001, + "reward": 2.481606125831604, + "reward_std": 0.29363201558589935, + "rewards/final_reward": 0.30051989067811424, + "rewards/mask_iou_reward": 0.15025994533905712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4816061407327652, + "rewards/thk_ans_format_reward": 1.0, + "step": 86, + "think_completion_length": 64.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.65625, + "epoch": 0.1467116357504216, + "grad_norm": 3.7263438785125964, + "kl": 0.052490234375, + "learning_rate": 9.706576728499157e-07, + "loss": 0.0001, + "reward": 2.79649555683136, + "reward_std": 0.3482329323887825, + "rewards/final_reward": 1.0713907295997764, + "rewards/mask_iou_reward": 0.5356953647998882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8121205568313599, + "rewards/thk_ans_format_reward": 0.984375, + "step": 87, + "think_completion_length": 57.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 0.14839797639123103, + "grad_norm": 142.75268382653746, + "kl": 0.061767578125, + "learning_rate": 9.703204047217538e-07, + "loss": 0.0001, + "reward": 3.3002171516418457, + "reward_std": 0.3195580244064331, + "rewards/final_reward": 0.9610996100313889, + "rewards/mask_iou_reward": 0.48054980501569444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3002171516418457, + "rewards/thk_ans_format_reward": 1.0, + "step": 88, + "think_completion_length": 63.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.453125, + "epoch": 0.15008431703204048, + "grad_norm": 6.2755111526202905, + "kl": 0.08056640625, + "learning_rate": 9.699831365935918e-07, + "loss": 0.0001, + "reward": 2.781501293182373, + "reward_std": 0.49102330207824707, + "rewards/final_reward": 0.7359130373387389, + "rewards/mask_iou_reward": 0.36795651866936946, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.781501293182373, + "rewards/thk_ans_format_reward": 1.0, + "step": 89, + "think_completion_length": 61.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.203125, + "epoch": 0.15177065767284992, + "grad_norm": 4.883317094191715, + "kl": 0.06640625, + "learning_rate": 9.6964586846543e-07, + "loss": 0.0001, + "reward": 2.446820020675659, + "reward_std": 0.4249372184276581, + "rewards/final_reward": 0.5249717239067151, + "rewards/mask_iou_reward": 0.26248586195335755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.44682009518146515, + "rewards/thk_ans_format_reward": 1.0, + "step": 90, + "think_completion_length": 70.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.09375, + "epoch": 0.15345699831365936, + "grad_norm": 4.402591818341303, + "kl": 0.0546875, + "learning_rate": 9.693086003372682e-07, + "loss": 0.0001, + "reward": 2.751936435699463, + "reward_std": 0.2803105264902115, + "rewards/final_reward": 0.7289582917623812, + "rewards/mask_iou_reward": 0.3644791458811906, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7519364431500435, + "rewards/thk_ans_format_reward": 1.0, + "step": 91, + "think_completion_length": 68.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.421875, + "epoch": 0.1551433389544688, + "grad_norm": 4.023932805501269, + "kl": 0.060791015625, + "learning_rate": 9.689713322091061e-07, + "loss": 0.0001, + "reward": 2.698573350906372, + "reward_std": 0.5618922114372253, + "rewards/final_reward": 0.8791768168432108, + "rewards/mask_iou_reward": 0.4395884084216054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6985732316970825, + "rewards/thk_ans_format_reward": 1.0, + "step": 92, + "think_completion_length": 72.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.015625, + "epoch": 0.15682967959527824, + "grad_norm": 4.913876907058581, + "kl": 0.076171875, + "learning_rate": 9.686340640809442e-07, + "loss": 0.0001, + "reward": 2.4915287494659424, + "reward_std": 0.267090268433094, + "rewards/final_reward": 0.8385419956098626, + "rewards/mask_iou_reward": 0.4192709978049313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4915286898612976, + "rewards/thk_ans_format_reward": 1.0, + "step": 93, + "think_completion_length": 68.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.03125, + "epoch": 0.15851602023608768, + "grad_norm": 4.936450902115474, + "kl": 0.06591796875, + "learning_rate": 9.682967959527825e-07, + "loss": 0.0001, + "reward": 2.5161983966827393, + "reward_std": 0.43957073986530304, + "rewards/final_reward": 0.8536540262243562, + "rewards/mask_iou_reward": 0.4268270131121781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5161983147263527, + "rewards/thk_ans_format_reward": 1.0, + "step": 94, + "think_completion_length": 69.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.734375, + "epoch": 0.16020236087689713, + "grad_norm": 10.081914568218277, + "kl": 0.0517578125, + "learning_rate": 9.679595278246206e-07, + "loss": 0.0001, + "reward": 2.6539634466171265, + "reward_std": 0.3619793802499771, + "rewards/final_reward": 0.5307201105892436, + "rewards/mask_iou_reward": 0.2653600552946218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6539634168148041, + "rewards/thk_ans_format_reward": 1.0, + "step": 95, + "think_completion_length": 78.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.796875, + "epoch": 0.16188870151770657, + "grad_norm": 3.9564086843292694, + "kl": 0.0625, + "learning_rate": 9.676222596964587e-07, + "loss": 0.0001, + "reward": 2.936839461326599, + "reward_std": 0.43852272629737854, + "rewards/final_reward": 1.3607585043371786, + "rewards/mask_iou_reward": 0.6803792521685893, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.952464371919632, + "rewards/thk_ans_format_reward": 0.984375, + "step": 96, + "think_completion_length": 57.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 0.163575042158516, + "grad_norm": 4.524169586310404, + "kl": 0.06884765625, + "learning_rate": 9.672849915682968e-07, + "loss": 0.0001, + "reward": 2.5702874660491943, + "reward_std": 0.33369340747594833, + "rewards/final_reward": 0.17149679590443948, + "rewards/mask_iou_reward": 0.08574839795221974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5702873766422272, + "rewards/thk_ans_format_reward": 1.0, + "step": 97, + "think_completion_length": 63.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.28125, + "epoch": 0.16526138279932545, + "grad_norm": 4.8851269262597485, + "kl": 0.059326171875, + "learning_rate": 9.669477234401348e-07, + "loss": 0.0001, + "reward": 2.660242795944214, + "reward_std": 0.42426833510398865, + "rewards/final_reward": 1.1212785651295316, + "rewards/mask_iou_reward": 0.5606392825647658, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6758678257465363, + "rewards/thk_ans_format_reward": 0.984375, + "step": 98, + "think_completion_length": 70.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.625, + "epoch": 0.16694772344013492, + "grad_norm": 6.948977823962185, + "kl": 0.059326171875, + "learning_rate": 9.66610455311973e-07, + "loss": 0.0001, + "reward": 2.9173457622528076, + "reward_std": 0.3428076356649399, + "rewards/final_reward": 0.24583723180278247, + "rewards/mask_iou_reward": 0.12291861590139123, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9173457622528076, + "rewards/thk_ans_format_reward": 1.0, + "step": 99, + "think_completion_length": 78.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.71875, + "epoch": 0.16863406408094436, + "grad_norm": 7.004314858753663, + "kl": 0.076171875, + "learning_rate": 9.66273187183811e-07, + "loss": 0.0001, + "reward": 2.7303144931793213, + "reward_std": 0.3506556749343872, + "rewards/final_reward": 0.8585279743266268, + "rewards/mask_iou_reward": 0.4292639871633134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7303146123886108, + "rewards/thk_ans_format_reward": 1.0, + "step": 100, + "think_completion_length": 66.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.484375, + "epoch": 0.1703204047217538, + "grad_norm": 9.72836401997028, + "kl": 0.067626953125, + "learning_rate": 9.659359190556491e-07, + "loss": 0.0001, + "reward": 3.545642137527466, + "reward_std": 0.36958497762680054, + "rewards/final_reward": 1.3276030857900536, + "rewards/mask_iou_reward": 0.6638015428950268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5456423163414001, + "rewards/thk_ans_format_reward": 1.0, + "step": 101, + "think_completion_length": 68.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.421875, + "epoch": 0.17200674536256325, + "grad_norm": 3.9097650791535936, + "kl": 0.06982421875, + "learning_rate": 9.655986509274872e-07, + "loss": 0.0001, + "reward": 2.8055737018585205, + "reward_std": 0.30134592577815056, + "rewards/final_reward": 1.0802610803244432, + "rewards/mask_iou_reward": 0.5401305401622216, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8055737316608429, + "rewards/thk_ans_format_reward": 1.0, + "step": 102, + "think_completion_length": 77.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.375, + "epoch": 0.1736930860033727, + "grad_norm": 12.43241529951544, + "kl": 0.0810546875, + "learning_rate": 9.652613827993255e-07, + "loss": 0.0001, + "reward": 2.6046589612960815, + "reward_std": 0.36044664680957794, + "rewards/final_reward": 0.9996509333597047, + "rewards/mask_iou_reward": 0.49982546667985234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6046590209007263, + "rewards/thk_ans_format_reward": 1.0, + "step": 103, + "think_completion_length": 75.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.28125, + "epoch": 0.17537942664418213, + "grad_norm": 4.524572883875218, + "kl": 0.099365234375, + "learning_rate": 9.649241146711636e-07, + "loss": 0.0001, + "reward": 2.2724088430404663, + "reward_std": 0.18580714613199234, + "rewards/final_reward": 0.41834230934148653, + "rewards/mask_iou_reward": 0.20917115467074326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.27240876853466034, + "rewards/thk_ans_format_reward": 1.0, + "step": 104, + "think_completion_length": 79.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.109375, + "epoch": 0.17706576728499157, + "grad_norm": 14.131641702675266, + "kl": 0.06591796875, + "learning_rate": 9.645868465430017e-07, + "loss": 0.0001, + "reward": 2.3781230449676514, + "reward_std": 0.26029431354254484, + "rewards/final_reward": 0.4826050544302346, + "rewards/mask_iou_reward": 0.2413025272151173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.37812306452542543, + "rewards/thk_ans_format_reward": 1.0, + "step": 105, + "think_completion_length": 97.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.21875, + "epoch": 0.178752107925801, + "grad_norm": 6.944889536872177, + "kl": 0.0791015625, + "learning_rate": 9.642495784148398e-07, + "loss": 0.0001, + "reward": 3.2580912113189697, + "reward_std": 0.3155831843614578, + "rewards/final_reward": 0.9140460384999494, + "rewards/mask_iou_reward": 0.4570230192499747, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2580912709236145, + "rewards/thk_ans_format_reward": 1.0, + "step": 106, + "think_completion_length": 103.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5, + "epoch": 0.18043844856661045, + "grad_norm": 5.649283035789506, + "kl": 0.0592041015625, + "learning_rate": 9.639123102866778e-07, + "loss": 0.0001, + "reward": 2.871160387992859, + "reward_std": 0.28134259581565857, + "rewards/final_reward": 1.0845442635996905, + "rewards/mask_iou_reward": 0.5422721317998452, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8711603879928589, + "rewards/thk_ans_format_reward": 1.0, + "step": 107, + "think_completion_length": 86.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.4375, + "epoch": 0.1821247892074199, + "grad_norm": 4.979931040893193, + "kl": 0.07080078125, + "learning_rate": 9.63575042158516e-07, + "loss": 0.0001, + "reward": 2.6347672939300537, + "reward_std": 0.3419078588485718, + "rewards/final_reward": 0.7823988325154321, + "rewards/mask_iou_reward": 0.39119941625771604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6347672492265701, + "rewards/thk_ans_format_reward": 1.0, + "step": 108, + "think_completion_length": 81.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.765625, + "epoch": 0.18381112984822934, + "grad_norm": 16.6980716433444, + "kl": 0.0577392578125, + "learning_rate": 9.63237774030354e-07, + "loss": 0.0001, + "reward": 2.8260412216186523, + "reward_std": 0.2677394151687622, + "rewards/final_reward": 0.9963177445880472, + "rewards/mask_iou_reward": 0.4981588722940236, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8416662067174911, + "rewards/thk_ans_format_reward": 0.984375, + "step": 109, + "think_completion_length": 88.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.828125, + "epoch": 0.18549747048903878, + "grad_norm": 2.9645999481574634, + "kl": 0.05419921875, + "learning_rate": 9.629005059021921e-07, + "loss": 0.0001, + "reward": 2.6315606832504272, + "reward_std": 0.3063789587467909, + "rewards/final_reward": 0.47596354687676634, + "rewards/mask_iou_reward": 0.23798177343838317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6315606329590082, + "rewards/thk_ans_format_reward": 1.0, + "step": 110, + "think_completion_length": 119.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.359375, + "epoch": 0.18718381112984822, + "grad_norm": 5.187350710181049, + "kl": 0.058349609375, + "learning_rate": 9.625632377740302e-07, + "loss": 0.0001, + "reward": 3.09405779838562, + "reward_std": 0.15743490681052208, + "rewards/final_reward": 1.468626562133811, + "rewards/mask_iou_reward": 0.7343132810669055, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0940579175949097, + "rewards/thk_ans_format_reward": 1.0, + "step": 111, + "think_completion_length": 94.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0, + "epoch": 0.18887015177065766, + "grad_norm": 4.049281276892838, + "kl": 0.11572265625, + "learning_rate": 9.622259696458685e-07, + "loss": 0.0001, + "reward": 3.07741117477417, + "reward_std": 0.5245492458343506, + "rewards/final_reward": 1.250567170226694, + "rewards/mask_iou_reward": 0.625283585113347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0774111449718475, + "rewards/thk_ans_format_reward": 1.0, + "step": 112, + "think_completion_length": 72.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.9375, + "epoch": 0.1905564924114671, + "grad_norm": 8.029055466527632, + "kl": 0.054443359375, + "learning_rate": 9.618887015177066e-07, + "loss": 0.0001, + "reward": 2.5914098024368286, + "reward_std": 0.4398697763681412, + "rewards/final_reward": 0.7285501404788846, + "rewards/mask_iou_reward": 0.3642750702394423, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.6070348471403122, + "rewards/thk_ans_format_reward": 1.0, + "step": 113, + "think_completion_length": 78.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.140625, + "epoch": 0.19224283305227655, + "grad_norm": 3.086404264380978, + "kl": 0.05078125, + "learning_rate": 9.615514333895447e-07, + "loss": 0.0001, + "reward": 2.4672300815582275, + "reward_std": 0.29644207656383514, + "rewards/final_reward": 0.706325008253994, + "rewards/mask_iou_reward": 0.353162504126997, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4672301113605499, + "rewards/thk_ans_format_reward": 1.0, + "step": 114, + "think_completion_length": 80.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.65625, + "epoch": 0.19392917369308602, + "grad_norm": 4.464912668237362, + "kl": 0.050537109375, + "learning_rate": 9.612141652613828e-07, + "loss": 0.0001, + "reward": 2.9204636812210083, + "reward_std": 0.1726682484149933, + "rewards/final_reward": 1.011700142074487, + "rewards/mask_iou_reward": 0.5058500710372436, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9204636216163635, + "rewards/thk_ans_format_reward": 1.0, + "step": 115, + "think_completion_length": 61.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.609375, + "epoch": 0.19561551433389546, + "grad_norm": 4.630079087067869, + "kl": 0.0567626953125, + "learning_rate": 9.608768971332208e-07, + "loss": 0.0001, + "reward": 2.5220160484313965, + "reward_std": 0.2584230601787567, + "rewards/final_reward": 0.5448442829958422, + "rewards/mask_iou_reward": 0.2724221414979211, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5220160484313965, + "rewards/thk_ans_format_reward": 1.0, + "step": 116, + "think_completion_length": 89.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.890625, + "epoch": 0.1973018549747049, + "grad_norm": 4.621872533032156, + "kl": 0.056884765625, + "learning_rate": 9.60539629005059e-07, + "loss": 0.0001, + "reward": 2.9947354793548584, + "reward_std": 0.20887230336666107, + "rewards/final_reward": 0.9475295686720593, + "rewards/mask_iou_reward": 0.47376478433602964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9947354048490524, + "rewards/thk_ans_format_reward": 1.0, + "step": 117, + "think_completion_length": 80.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.421875, + "epoch": 0.19898819561551434, + "grad_norm": 9.497515578211454, + "kl": 0.063720703125, + "learning_rate": 9.60202360876897e-07, + "loss": 0.0001, + "reward": 2.611419439315796, + "reward_std": 0.36831772327423096, + "rewards/final_reward": 0.7176738346947997, + "rewards/mask_iou_reward": 0.35883691734739986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6114194691181183, + "rewards/thk_ans_format_reward": 1.0, + "step": 118, + "think_completion_length": 98.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.453125, + "epoch": 0.20067453625632378, + "grad_norm": 55.096724445429054, + "kl": 0.05517578125, + "learning_rate": 9.598650927487351e-07, + "loss": 0.0001, + "reward": 3.0991926193237305, + "reward_std": 0.3100406602025032, + "rewards/final_reward": 0.8859623609390234, + "rewards/mask_iou_reward": 0.4429811804695117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0991926789283752, + "rewards/thk_ans_format_reward": 1.0, + "step": 119, + "think_completion_length": 90.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.78125, + "epoch": 0.20236087689713322, + "grad_norm": 8.39851702173129, + "kl": 0.04931640625, + "learning_rate": 9.595278246205734e-07, + "loss": 0.0, + "reward": 2.797035336494446, + "reward_std": 0.1637876257300377, + "rewards/final_reward": 0.8639884837569479, + "rewards/mask_iou_reward": 0.43199424187847396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7970353364944458, + "rewards/thk_ans_format_reward": 1.0, + "step": 120, + "think_completion_length": 89.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.421875, + "epoch": 0.20404721753794267, + "grad_norm": 2.534477601211979, + "kl": 0.043701171875, + "learning_rate": 9.591905564924115e-07, + "loss": -0.0001, + "reward": 2.9477614164352417, + "reward_std": 0.2593442127108574, + "rewards/final_reward": 1.5180886726556748, + "rewards/mask_iou_reward": 0.7590443363278374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9477613866329193, + "rewards/thk_ans_format_reward": 1.0, + "step": 121, + "think_completion_length": 81.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.609375, + "epoch": 0.2057335581787521, + "grad_norm": 4.377934435913437, + "kl": 0.0535888671875, + "learning_rate": 9.588532883642496e-07, + "loss": 0.0001, + "reward": 2.576362371444702, + "reward_std": 0.25838133692741394, + "rewards/final_reward": 0.2687840057189522, + "rewards/mask_iou_reward": 0.1343920028594761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5763622224330902, + "rewards/thk_ans_format_reward": 1.0, + "step": 122, + "think_completion_length": 105.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.9375, + "epoch": 0.20741989881956155, + "grad_norm": 6.588359922719636, + "kl": 0.0567626953125, + "learning_rate": 9.585160202360877e-07, + "loss": 0.0001, + "reward": 2.8375957012176514, + "reward_std": 0.4054133892059326, + "rewards/final_reward": 0.9836463211897768, + "rewards/mask_iou_reward": 0.4918231605948884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8375958502292633, + "rewards/thk_ans_format_reward": 1.0, + "step": 123, + "think_completion_length": 96.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.671875, + "epoch": 0.209106239460371, + "grad_norm": 10.814198077246532, + "kl": 0.0535888671875, + "learning_rate": 9.581787521079258e-07, + "loss": 0.0001, + "reward": 2.4431896209716797, + "reward_std": 0.2660168632864952, + "rewards/final_reward": 0.5794934557994321, + "rewards/mask_iou_reward": 0.28974672789971606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.44318948313593864, + "rewards/thk_ans_format_reward": 1.0, + "step": 124, + "think_completion_length": 79.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.328125, + "epoch": 0.21079258010118043, + "grad_norm": 4.274399889954174, + "kl": 0.0465087890625, + "learning_rate": 9.578414839797638e-07, + "loss": 0.0, + "reward": 3.0381346940994263, + "reward_std": 0.22179779410362244, + "rewards/final_reward": 1.4394426651939196, + "rewards/mask_iou_reward": 0.7197213325969598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0381346344947815, + "rewards/thk_ans_format_reward": 1.0, + "step": 125, + "think_completion_length": 102.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.6875, + "epoch": 0.21247892074198987, + "grad_norm": 8.152408435882341, + "kl": 0.0556640625, + "learning_rate": 9.57504215851602e-07, + "loss": 0.0001, + "reward": 2.923964023590088, + "reward_std": 0.2563727870583534, + "rewards/final_reward": 1.089894641666783, + "rewards/mask_iou_reward": 0.5449473208333915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9239640831947327, + "rewards/thk_ans_format_reward": 1.0, + "step": 126, + "think_completion_length": 85.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.265625, + "epoch": 0.21416526138279932, + "grad_norm": 4.449214680152263, + "kl": 0.0687255859375, + "learning_rate": 9.5716694772344e-07, + "loss": 0.0001, + "reward": 2.740877628326416, + "reward_std": 0.46522799134254456, + "rewards/final_reward": 1.1671571540552383, + "rewards/mask_iou_reward": 0.5835785770276192, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7565025985240936, + "rewards/thk_ans_format_reward": 0.984375, + "step": 127, + "think_completion_length": 91.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.15625, + "epoch": 0.21585160202360876, + "grad_norm": 4.17184352546643, + "kl": 0.060546875, + "learning_rate": 9.56829679595278e-07, + "loss": 0.0001, + "reward": 2.8540847301483154, + "reward_std": 0.4098881930112839, + "rewards/final_reward": 0.3388112149192265, + "rewards/mask_iou_reward": 0.16940560745961325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.854084700345993, + "rewards/thk_ans_format_reward": 1.0, + "step": 128, + "think_completion_length": 88.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.34375, + "epoch": 0.2175379426644182, + "grad_norm": 15.065555837938753, + "kl": 0.072265625, + "learning_rate": 9.564924114671164e-07, + "loss": 0.0001, + "reward": 3.1309502124786377, + "reward_std": 0.4588165432214737, + "rewards/final_reward": 1.147073274599184, + "rewards/mask_iou_reward": 0.573536637299592, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1309503316879272, + "rewards/thk_ans_format_reward": 1.0, + "step": 129, + "think_completion_length": 92.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.9375, + "epoch": 0.21922428330522767, + "grad_norm": 22.465927435670828, + "kl": 0.08154296875, + "learning_rate": 9.561551433389545e-07, + "loss": 0.0001, + "reward": 2.7864497900009155, + "reward_std": 0.31874626129865646, + "rewards/final_reward": 0.4232256065266388, + "rewards/mask_iou_reward": 0.2116128032633194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7864498198032379, + "rewards/thk_ans_format_reward": 1.0, + "step": 130, + "think_completion_length": 92.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.3125, + "epoch": 0.2209106239460371, + "grad_norm": 6.419347933512376, + "kl": 0.077880859375, + "learning_rate": 9.558178752107926e-07, + "loss": 0.0001, + "reward": 2.908892869949341, + "reward_std": 0.44153931736946106, + "rewards/final_reward": 0.5672732777281655, + "rewards/mask_iou_reward": 0.28363663886408275, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9088928699493408, + "rewards/thk_ans_format_reward": 1.0, + "step": 131, + "think_completion_length": 71.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.59375, + "epoch": 0.22259696458684655, + "grad_norm": 38.99623994278225, + "kl": 0.086181640625, + "learning_rate": 9.554806070826307e-07, + "loss": 0.0001, + "reward": 3.0554357767105103, + "reward_std": 0.3175787627696991, + "rewards/final_reward": 0.8004080636881445, + "rewards/mask_iou_reward": 0.40020403184407227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.055435687303543, + "rewards/thk_ans_format_reward": 1.0, + "step": 132, + "think_completion_length": 78.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.65625, + "epoch": 0.224283305227656, + "grad_norm": 3.0018346933655375, + "kl": 0.087646484375, + "learning_rate": 9.551433389544688e-07, + "loss": 0.0001, + "reward": 2.4450541734695435, + "reward_std": 0.34064342081546783, + "rewards/final_reward": 0.4599661168873327, + "rewards/mask_iou_reward": 0.22998305844366634, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.46067917346954346, + "rewards/thk_ans_format_reward": 0.984375, + "step": 133, + "think_completion_length": 79.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.625, + "epoch": 0.22596964586846544, + "grad_norm": 9.578539076710266, + "kl": 0.0859375, + "learning_rate": 9.548060708263068e-07, + "loss": 0.0001, + "reward": 2.931985855102539, + "reward_std": 0.48415741324424744, + "rewards/final_reward": 0.7854134764268852, + "rewards/mask_iou_reward": 0.3927067382134426, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9319857358932495, + "rewards/thk_ans_format_reward": 1.0, + "step": 134, + "think_completion_length": 92.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.484375, + "epoch": 0.22765598650927488, + "grad_norm": 3.9455952839664166, + "kl": 0.0791015625, + "learning_rate": 9.54468802698145e-07, + "loss": 0.0001, + "reward": 2.699129104614258, + "reward_std": 0.46679961681365967, + "rewards/final_reward": 0.6493981932567956, + "rewards/mask_iou_reward": 0.3246990966283978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6991291344165802, + "rewards/thk_ans_format_reward": 1.0, + "step": 135, + "think_completion_length": 77.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.671875, + "epoch": 0.22934232715008432, + "grad_norm": 26.084465650772835, + "kl": 0.6328125, + "learning_rate": 9.54131534569983e-07, + "loss": 0.0006, + "reward": 2.6946918964385986, + "reward_std": 0.3354320228099823, + "rewards/final_reward": 0.917651972879823, + "rewards/mask_iou_reward": 0.4588259864399115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7103168219327927, + "rewards/thk_ans_format_reward": 0.984375, + "step": 136, + "think_completion_length": 79.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.171875, + "epoch": 0.23102866779089376, + "grad_norm": 5.895893689415301, + "kl": 0.078125, + "learning_rate": 9.53794266441821e-07, + "loss": 0.0001, + "reward": 2.541081666946411, + "reward_std": 0.22961698472499847, + "rewards/final_reward": 0.9253931676236163, + "rewards/mask_iou_reward": 0.46269658381180817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5410817265510559, + "rewards/thk_ans_format_reward": 1.0, + "step": 137, + "think_completion_length": 87.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.046875, + "epoch": 0.2327150084317032, + "grad_norm": 4.965800154740723, + "kl": 0.08056640625, + "learning_rate": 9.534569983136593e-07, + "loss": 0.0001, + "reward": 2.9825655221939087, + "reward_std": 0.286454439163208, + "rewards/final_reward": 1.7143570051156443, + "rewards/mask_iou_reward": 0.8571785025578221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9981906414031982, + "rewards/thk_ans_format_reward": 0.984375, + "step": 138, + "think_completion_length": 92.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.640625, + "epoch": 0.23440134907251264, + "grad_norm": 3.891606818741714, + "kl": 0.1005859375, + "learning_rate": 9.531197301854974e-07, + "loss": 0.0001, + "reward": 2.3226908445358276, + "reward_std": 0.2071321550756693, + "rewards/final_reward": 0.4069366052566161, + "rewards/mask_iou_reward": 0.20346830262830806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.32269074441865087, + "rewards/thk_ans_format_reward": 1.0, + "step": 139, + "think_completion_length": 84.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.65625, + "epoch": 0.23608768971332209, + "grad_norm": 9.336418443843108, + "kl": 0.087890625, + "learning_rate": 9.527824620573356e-07, + "loss": 0.0001, + "reward": 3.410194993019104, + "reward_std": 0.25575824826955795, + "rewards/final_reward": 1.5624018070481909, + "rewards/mask_iou_reward": 0.7812009035240954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4101950526237488, + "rewards/thk_ans_format_reward": 1.0, + "step": 140, + "think_completion_length": 77.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.90625, + "epoch": 0.23777403035413153, + "grad_norm": 6.200561227361159, + "kl": 0.09375, + "learning_rate": 9.524451939291737e-07, + "loss": 0.0001, + "reward": 2.8541425466537476, + "reward_std": 0.6211456060409546, + "rewards/final_reward": 1.2935642400096632, + "rewards/mask_iou_reward": 0.6467821200048316, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.8853925466537476, + "rewards/thk_ans_format_reward": 1.0, + "step": 141, + "think_completion_length": 78.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.25, + "epoch": 0.23946037099494097, + "grad_norm": 10.65132715665166, + "kl": 0.093017578125, + "learning_rate": 9.521079258010118e-07, + "loss": 0.0001, + "reward": 2.95055615901947, + "reward_std": 0.5230741798877716, + "rewards/final_reward": 0.6756522948538228, + "rewards/mask_iou_reward": 0.3378261474269114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9505561292171478, + "rewards/thk_ans_format_reward": 1.0, + "step": 142, + "think_completion_length": 65.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.953125, + "epoch": 0.2411467116357504, + "grad_norm": 5.2077714220137405, + "kl": 0.097900390625, + "learning_rate": 9.517706576728499e-07, + "loss": 0.0001, + "reward": 3.337947726249695, + "reward_std": 0.2577322721481323, + "rewards/final_reward": 0.8645473082831757, + "rewards/mask_iou_reward": 0.43227365414158786, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.33794766664505, + "rewards/thk_ans_format_reward": 1.0, + "step": 143, + "think_completion_length": 77.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.03125, + "epoch": 0.24283305227655985, + "grad_norm": 8.046601536672426, + "kl": 0.1103515625, + "learning_rate": 9.51433389544688e-07, + "loss": 0.0001, + "reward": 2.8948484659194946, + "reward_std": 0.25463247299194336, + "rewards/final_reward": 0.524457777173691, + "rewards/mask_iou_reward": 0.2622288885868455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9104736149311066, + "rewards/thk_ans_format_reward": 0.984375, + "step": 144, + "think_completion_length": 62.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.21875, + "epoch": 0.24451939291736932, + "grad_norm": 5.0074848777872445, + "kl": 0.103759765625, + "learning_rate": 9.51096121416526e-07, + "loss": 0.0001, + "reward": 2.683648943901062, + "reward_std": 0.3750077337026596, + "rewards/final_reward": 0.8593287375390414, + "rewards/mask_iou_reward": 0.4296643687695207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6836489215493202, + "rewards/thk_ans_format_reward": 1.0, + "step": 145, + "think_completion_length": 73.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 0.24620573355817876, + "grad_norm": 4.963438244921841, + "kl": 0.1044921875, + "learning_rate": 9.507588532883642e-07, + "loss": 0.0001, + "reward": 2.661731004714966, + "reward_std": 0.3136795163154602, + "rewards/final_reward": 0.4139575300345308, + "rewards/mask_iou_reward": 0.2069787650172654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6617311537265778, + "rewards/thk_ans_format_reward": 1.0, + "step": 146, + "think_completion_length": 61.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.921875, + "epoch": 0.2478920741989882, + "grad_norm": 9.38267745831431, + "kl": 0.11376953125, + "learning_rate": 9.504215851602023e-07, + "loss": 0.0001, + "reward": 3.2500319480895996, + "reward_std": 0.46346913278102875, + "rewards/final_reward": 1.0314903289685486, + "rewards/mask_iou_reward": 0.5157451644842743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2500320076942444, + "rewards/thk_ans_format_reward": 1.0, + "step": 147, + "think_completion_length": 72.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.125, + "epoch": 0.24957841483979765, + "grad_norm": 12.293398423965753, + "kl": 0.099853515625, + "learning_rate": 9.500843170320404e-07, + "loss": 0.0001, + "reward": 3.3502708673477173, + "reward_std": 0.4048406034708023, + "rewards/final_reward": 1.096210174595348, + "rewards/mask_iou_reward": 0.548105087297674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3502709865570068, + "rewards/thk_ans_format_reward": 1.0, + "step": 148, + "think_completion_length": 67.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.65625, + "epoch": 0.25126475548060706, + "grad_norm": 5.36062250866705, + "kl": 0.1181640625, + "learning_rate": 9.497470489038786e-07, + "loss": 0.0001, + "reward": 2.4620732069015503, + "reward_std": 0.2523811161518097, + "rewards/final_reward": 0.3980805396072795, + "rewards/mask_iou_reward": 0.19904026980363976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4620732367038727, + "rewards/thk_ans_format_reward": 1.0, + "step": 149, + "think_completion_length": 69.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.734375, + "epoch": 0.25295109612141653, + "grad_norm": 4.872750940333902, + "kl": 0.10546875, + "learning_rate": 9.494097807757167e-07, + "loss": 0.0001, + "reward": 3.1407504081726074, + "reward_std": 0.1939391940832138, + "rewards/final_reward": 1.8599050823740502, + "rewards/mask_iou_reward": 0.9299525411870251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1407504081726074, + "rewards/thk_ans_format_reward": 1.0, + "step": 150, + "think_completion_length": 65.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.765625, + "epoch": 0.25463743676222594, + "grad_norm": 3.2884978489457337, + "kl": 0.099609375, + "learning_rate": 9.490725126475548e-07, + "loss": 0.0001, + "reward": 2.5347702503204346, + "reward_std": 0.2712481617927551, + "rewards/final_reward": 0.6935733601988581, + "rewards/mask_iou_reward": 0.34678668009942903, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5347701907157898, + "rewards/thk_ans_format_reward": 1.0, + "step": 151, + "think_completion_length": 73.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.734375, + "epoch": 0.2563237774030354, + "grad_norm": 5.393263085729027, + "kl": 0.10888671875, + "learning_rate": 9.487352445193929e-07, + "loss": 0.0001, + "reward": 2.9217779636383057, + "reward_std": 0.3797933831810951, + "rewards/final_reward": 1.345318416221491, + "rewards/mask_iou_reward": 0.6726592081107455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9217780828475952, + "rewards/thk_ans_format_reward": 1.0, + "step": 152, + "think_completion_length": 76.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.328125, + "epoch": 0.2580101180438449, + "grad_norm": 5.26665151298103, + "kl": 0.1240234375, + "learning_rate": 9.48397976391231e-07, + "loss": 0.0001, + "reward": 3.006482243537903, + "reward_std": 0.37782153487205505, + "rewards/final_reward": 1.1417938718538108, + "rewards/mask_iou_reward": 0.5708969359269054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0064822435379028, + "rewards/thk_ans_format_reward": 1.0, + "step": 153, + "think_completion_length": 69.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.640625, + "epoch": 0.2596964586846543, + "grad_norm": 6.030885434029907, + "kl": 0.111572265625, + "learning_rate": 9.48060708263069e-07, + "loss": 0.0001, + "reward": 2.444668173789978, + "reward_std": 0.2628085985779762, + "rewards/final_reward": 0.01806783909522232, + "rewards/mask_iou_reward": 0.00903391954761116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4446682594716549, + "rewards/thk_ans_format_reward": 1.0, + "step": 154, + "think_completion_length": 65.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.703125, + "epoch": 0.26138279932546377, + "grad_norm": 13.520169048491766, + "kl": 0.126953125, + "learning_rate": 9.477234401349072e-07, + "loss": 0.0001, + "reward": 2.868806838989258, + "reward_std": 0.4705194979906082, + "rewards/final_reward": 0.8561282192138441, + "rewards/mask_iou_reward": 0.42806410960692204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8688068389892578, + "rewards/thk_ans_format_reward": 1.0, + "step": 155, + "think_completion_length": 55.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.90625, + "epoch": 0.2630691399662732, + "grad_norm": 13.417822343339733, + "kl": 0.122314453125, + "learning_rate": 9.473861720067453e-07, + "loss": 0.0001, + "reward": 3.0076318979263306, + "reward_std": 0.303857646882534, + "rewards/final_reward": 0.8894650284318986, + "rewards/mask_iou_reward": 0.4447325142159493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0076318979263306, + "rewards/thk_ans_format_reward": 1.0, + "step": 156, + "think_completion_length": 72.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.26475548060708265, + "grad_norm": 8.72973658808703, + "kl": 0.14404296875, + "learning_rate": 9.470489038785834e-07, + "loss": 0.0001, + "reward": 2.947430729866028, + "reward_std": 0.15501541644334793, + "rewards/final_reward": 0.2652462593609649, + "rewards/mask_iou_reward": 0.13262312968048245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9474307894706726, + "rewards/thk_ans_format_reward": 1.0, + "step": 157, + "think_completion_length": 72.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.046875, + "epoch": 0.26644182124789206, + "grad_norm": 6.762615497616932, + "kl": 0.11279296875, + "learning_rate": 9.467116357504216e-07, + "loss": 0.0001, + "reward": 3.3199820518493652, + "reward_std": 0.25218017399311066, + "rewards/final_reward": 1.0872463613153267, + "rewards/mask_iou_reward": 0.5436231806576634, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.31998211145401, + "rewards/thk_ans_format_reward": 1.0, + "step": 158, + "think_completion_length": 67.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.828125, + "epoch": 0.26812816188870153, + "grad_norm": 5.092870880513967, + "kl": 0.10546875, + "learning_rate": 9.463743676222597e-07, + "loss": 0.0001, + "reward": 2.7609550952911377, + "reward_std": 0.3843376636505127, + "rewards/final_reward": 0.8584499362887056, + "rewards/mask_iou_reward": 0.4292249681443528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7609550952911377, + "rewards/thk_ans_format_reward": 1.0, + "step": 159, + "think_completion_length": 78.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0625, + "epoch": 0.26981450252951095, + "grad_norm": 5.147772569205811, + "kl": 0.1318359375, + "learning_rate": 9.460370994940977e-07, + "loss": 0.0001, + "reward": 3.031912684440613, + "reward_std": 0.4968326687812805, + "rewards/final_reward": 0.9247348266948624, + "rewards/mask_iou_reward": 0.4623674133474312, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0319127440452576, + "rewards/thk_ans_format_reward": 1.0, + "step": 160, + "think_completion_length": 64.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.40625, + "epoch": 0.2715008431703204, + "grad_norm": 4.8651031750590406, + "kl": 0.118408203125, + "learning_rate": 9.456998313659359e-07, + "loss": 0.0001, + "reward": 2.696570873260498, + "reward_std": 0.3566492199897766, + "rewards/final_reward": 0.5540198197293646, + "rewards/mask_iou_reward": 0.2770099098646823, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7121958136558533, + "rewards/thk_ans_format_reward": 1.0, + "step": 161, + "think_completion_length": 73.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 0.27318718381112983, + "grad_norm": 10.86627896290096, + "kl": 0.12841796875, + "learning_rate": 9.453625632377739e-07, + "loss": 0.0001, + "reward": 2.649171829223633, + "reward_std": 0.3913609981536865, + "rewards/final_reward": 0.5830436964675607, + "rewards/mask_iou_reward": 0.29152184823378036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6491719186306, + "rewards/thk_ans_format_reward": 1.0, + "step": 162, + "think_completion_length": 65.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.984375, + "epoch": 0.2748735244519393, + "grad_norm": 5.681694741511601, + "kl": 0.110107421875, + "learning_rate": 9.450252951096121e-07, + "loss": 0.0001, + "reward": 2.69570529460907, + "reward_std": 0.3432028442621231, + "rewards/final_reward": 0.9890574993933601, + "rewards/mask_iou_reward": 0.49452874969668004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.695705458521843, + "rewards/thk_ans_format_reward": 1.0, + "step": 163, + "think_completion_length": 63.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.46875, + "epoch": 0.2765598650927487, + "grad_norm": 21.25176205198972, + "kl": 0.1181640625, + "learning_rate": 9.446880269814502e-07, + "loss": 0.0001, + "reward": 2.522574782371521, + "reward_std": 0.27491385489702225, + "rewards/final_reward": 0.2878840173621903, + "rewards/mask_iou_reward": 0.14394200868109516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.522574707865715, + "rewards/thk_ans_format_reward": 1.0, + "step": 164, + "think_completion_length": 58.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.265625, + "epoch": 0.2782462057335582, + "grad_norm": 5.401276166916328, + "kl": 0.13037109375, + "learning_rate": 9.443507588532883e-07, + "loss": 0.0001, + "reward": 2.662353754043579, + "reward_std": 0.2629779279232025, + "rewards/final_reward": 1.0439006900901509, + "rewards/mask_iou_reward": 0.5219503450450754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6779787838459015, + "rewards/thk_ans_format_reward": 0.984375, + "step": 165, + "think_completion_length": 60.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.21875, + "epoch": 0.2799325463743676, + "grad_norm": 10.639716221887245, + "kl": 0.1279296875, + "learning_rate": 9.440134907251265e-07, + "loss": 0.0001, + "reward": 2.6253796815872192, + "reward_std": 0.2937234491109848, + "rewards/final_reward": 0.8173267802429207, + "rewards/mask_iou_reward": 0.40866339012146036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6253797858953476, + "rewards/thk_ans_format_reward": 1.0, + "step": 166, + "think_completion_length": 61.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.46875, + "epoch": 0.28161888701517707, + "grad_norm": 4.376668469291415, + "kl": 0.12744140625, + "learning_rate": 9.436762225969646e-07, + "loss": 0.0001, + "reward": 2.68786883354187, + "reward_std": 0.1904464066028595, + "rewards/final_reward": 0.1009874731727318, + "rewards/mask_iou_reward": 0.0504937365863659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.687868744134903, + "rewards/thk_ans_format_reward": 1.0, + "step": 167, + "think_completion_length": 70.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.796875, + "epoch": 0.28330522765598654, + "grad_norm": 4.162857484309753, + "kl": 0.1162109375, + "learning_rate": 9.433389544688027e-07, + "loss": 0.0001, + "reward": 2.4673460721969604, + "reward_std": 0.14471174776554108, + "rewards/final_reward": 0.6801467773036485, + "rewards/mask_iou_reward": 0.34007338865182424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4673460125923157, + "rewards/thk_ans_format_reward": 1.0, + "step": 168, + "think_completion_length": 70.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5625, + "epoch": 0.28499156829679595, + "grad_norm": 5.8655442391890285, + "kl": 0.2138671875, + "learning_rate": 9.430016863406409e-07, + "loss": 0.0002, + "reward": 2.7184778451919556, + "reward_std": 0.3520616292953491, + "rewards/final_reward": 1.123103388258644, + "rewards/mask_iou_reward": 0.561551694129322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7341028153896332, + "rewards/thk_ans_format_reward": 0.984375, + "step": 169, + "think_completion_length": 62.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.796875, + "epoch": 0.2866779089376054, + "grad_norm": 11.32268035590721, + "kl": 0.12255859375, + "learning_rate": 9.426644182124788e-07, + "loss": 0.0001, + "reward": 2.7999212741851807, + "reward_std": 0.3271795064210892, + "rewards/final_reward": 0.5870644804219973, + "rewards/mask_iou_reward": 0.29353224021099866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7999212741851807, + "rewards/thk_ans_format_reward": 1.0, + "step": 170, + "think_completion_length": 60.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.84375, + "epoch": 0.28836424957841483, + "grad_norm": 8.834700309247399, + "kl": 0.11962890625, + "learning_rate": 9.423271500843169e-07, + "loss": 0.0001, + "reward": 2.8714534044265747, + "reward_std": 0.44849054515361786, + "rewards/final_reward": 1.0860856449425818, + "rewards/mask_iou_reward": 0.5430428224712909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8714532852172852, + "rewards/thk_ans_format_reward": 1.0, + "step": 171, + "think_completion_length": 70.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.796875, + "epoch": 0.2900505902192243, + "grad_norm": 5.198743846520378, + "kl": 0.1474609375, + "learning_rate": 9.419898819561551e-07, + "loss": 0.0001, + "reward": 3.1690046787261963, + "reward_std": 0.13895303010940552, + "rewards/final_reward": 0.9271436176614136, + "rewards/mask_iou_reward": 0.4635718088307068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.169004738330841, + "rewards/thk_ans_format_reward": 1.0, + "step": 172, + "think_completion_length": 61.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.859375, + "epoch": 0.2917369308600337, + "grad_norm": 7.211165474999571, + "kl": 0.13525390625, + "learning_rate": 9.416526138279932e-07, + "loss": 0.0001, + "reward": 3.202454924583435, + "reward_std": 0.4479510486125946, + "rewards/final_reward": 1.5023353211161545, + "rewards/mask_iou_reward": 0.7511676605580773, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2024548053741455, + "rewards/thk_ans_format_reward": 1.0, + "step": 173, + "think_completion_length": 69.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.953125, + "epoch": 0.2934232715008432, + "grad_norm": 4.313657233067247, + "kl": 0.12841796875, + "learning_rate": 9.413153456998313e-07, + "loss": 0.0001, + "reward": 2.9632985591888428, + "reward_std": 0.31960703432559967, + "rewards/final_reward": 0.4206897495451917, + "rewards/mask_iou_reward": 0.21034487477259586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9632984697818756, + "rewards/thk_ans_format_reward": 1.0, + "step": 174, + "think_completion_length": 64.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.203125, + "epoch": 0.2951096121416526, + "grad_norm": 7.5441699247531675, + "kl": 0.128173828125, + "learning_rate": 9.409780775716695e-07, + "loss": 0.0001, + "reward": 2.729974389076233, + "reward_std": 0.20594902336597443, + "rewards/final_reward": 1.2675996908269855, + "rewards/mask_iou_reward": 0.6337998454134928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7299742102622986, + "rewards/thk_ans_format_reward": 1.0, + "step": 175, + "think_completion_length": 59.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.890625, + "epoch": 0.29679595278246207, + "grad_norm": 5.8778504219298675, + "kl": 0.14404296875, + "learning_rate": 9.406408094435076e-07, + "loss": 0.0001, + "reward": 3.54919171333313, + "reward_std": 0.20388521254062653, + "rewards/final_reward": 1.228838939231336, + "rewards/mask_iou_reward": 0.614419469615668, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.549191653728485, + "rewards/thk_ans_format_reward": 1.0, + "step": 176, + "think_completion_length": 62.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.34375, + "epoch": 0.2984822934232715, + "grad_norm": 3.521078841102136, + "kl": 0.13330078125, + "learning_rate": 9.403035413153457e-07, + "loss": 0.0001, + "reward": 3.1563611030578613, + "reward_std": 0.3507531061768532, + "rewards/final_reward": 1.1977695530795711, + "rewards/mask_iou_reward": 0.5988847765397856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1563609838485718, + "rewards/thk_ans_format_reward": 1.0, + "step": 177, + "think_completion_length": 59.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.890625, + "epoch": 0.30016863406408095, + "grad_norm": 5.740861912339471, + "kl": 0.1357421875, + "learning_rate": 9.399662731871839e-07, + "loss": 0.0001, + "reward": 2.9897106885910034, + "reward_std": 0.4273761063814163, + "rewards/final_reward": 1.2805484639491822, + "rewards/mask_iou_reward": 0.6402742319745911, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0209608376026154, + "rewards/thk_ans_format_reward": 0.984375, + "step": 178, + "think_completion_length": 56.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.59375, + "epoch": 0.30185497470489037, + "grad_norm": 9.879177378853775, + "kl": 0.16455078125, + "learning_rate": 9.396290050590218e-07, + "loss": 0.0002, + "reward": 2.9865111112594604, + "reward_std": 0.350925549864769, + "rewards/final_reward": 1.4638744302432896, + "rewards/mask_iou_reward": 0.7319372151216448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.98651123046875, + "rewards/thk_ans_format_reward": 1.0, + "step": 179, + "think_completion_length": 59.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.21875, + "epoch": 0.30354131534569984, + "grad_norm": 8.287354511345475, + "kl": 0.144775390625, + "learning_rate": 9.392917369308599e-07, + "loss": 0.0001, + "reward": 2.5636744499206543, + "reward_std": 0.4225110709667206, + "rewards/final_reward": 0.8577971128846306, + "rewards/mask_iou_reward": 0.4288985564423153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5636745691299438, + "rewards/thk_ans_format_reward": 1.0, + "step": 180, + "think_completion_length": 54.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.84375, + "epoch": 0.30522765598650925, + "grad_norm": 4.22167133798815, + "kl": 0.15380859375, + "learning_rate": 9.389544688026981e-07, + "loss": 0.0002, + "reward": 3.118129253387451, + "reward_std": 0.22842250019311905, + "rewards/final_reward": 0.9386232568832861, + "rewards/mask_iou_reward": 0.46931162844164304, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.118129312992096, + "rewards/thk_ans_format_reward": 1.0, + "step": 181, + "think_completion_length": 61.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.34375, + "epoch": 0.3069139966273187, + "grad_norm": 4.843783601225924, + "kl": 0.14697265625, + "learning_rate": 9.386172006745362e-07, + "loss": 0.0001, + "reward": 3.015873432159424, + "reward_std": 0.4006500840187073, + "rewards/final_reward": 1.1393304081924156, + "rewards/mask_iou_reward": 0.5696652040962078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0158734321594238, + "rewards/thk_ans_format_reward": 1.0, + "step": 182, + "think_completion_length": 58.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.046875, + "epoch": 0.3086003372681282, + "grad_norm": 3.3709277392436303, + "kl": 0.1494140625, + "learning_rate": 9.382799325463743e-07, + "loss": 0.0001, + "reward": 2.794854760169983, + "reward_std": 0.2856273353099823, + "rewards/final_reward": 0.6165165264207468, + "rewards/mask_iou_reward": 0.3082582632103734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7948549091815948, + "rewards/thk_ans_format_reward": 1.0, + "step": 183, + "think_completion_length": 59.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.75, + "epoch": 0.3102866779089376, + "grad_norm": 6.530725976667846, + "kl": 0.19873046875, + "learning_rate": 9.379426644182125e-07, + "loss": 0.0002, + "reward": 3.2726598978042603, + "reward_std": 0.2622811198234558, + "rewards/final_reward": 0.9330878932926485, + "rewards/mask_iou_reward": 0.46654394664632426, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2726598978042603, + "rewards/thk_ans_format_reward": 1.0, + "step": 184, + "think_completion_length": 59.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 0.31197301854974707, + "grad_norm": 5.970535259420742, + "kl": 0.145263671875, + "learning_rate": 9.376053962900506e-07, + "loss": 0.0001, + "reward": 2.518738865852356, + "reward_std": 0.3007535934448242, + "rewards/final_reward": 0.295968154432344, + "rewards/mask_iou_reward": 0.147984077216172, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5187387764453888, + "rewards/thk_ans_format_reward": 1.0, + "step": 185, + "think_completion_length": 57.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.40625, + "epoch": 0.3136593591905565, + "grad_norm": 7.038454246978713, + "kl": 0.1533203125, + "learning_rate": 9.372681281618887e-07, + "loss": 0.0002, + "reward": 2.8668417930603027, + "reward_std": 0.34225399792194366, + "rewards/final_reward": 0.5465141226757314, + "rewards/mask_iou_reward": 0.2732570613378657, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8824667930603027, + "rewards/thk_ans_format_reward": 1.0, + "step": 186, + "think_completion_length": 59.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.546875, + "epoch": 0.31534569983136596, + "grad_norm": 18.462270438443714, + "kl": 0.14599609375, + "learning_rate": 9.369308600337267e-07, + "loss": 0.0001, + "reward": 3.488566040992737, + "reward_std": 0.20074902474880219, + "rewards/final_reward": 1.8745138450901717, + "rewards/mask_iou_reward": 0.9372569225450859, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4885660409927368, + "rewards/thk_ans_format_reward": 1.0, + "step": 187, + "think_completion_length": 58.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.390625, + "epoch": 0.31703204047217537, + "grad_norm": 3.9471191401956762, + "kl": 0.1474609375, + "learning_rate": 9.365935919055648e-07, + "loss": 0.0001, + "reward": 3.4551355838775635, + "reward_std": 0.29292161762714386, + "rewards/final_reward": 1.4607345589720289, + "rewards/mask_iou_reward": 0.7303672794860144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4551355838775635, + "rewards/thk_ans_format_reward": 1.0, + "step": 188, + "think_completion_length": 62.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.796875, + "epoch": 0.31871838111298484, + "grad_norm": 4.814018672200347, + "kl": 0.134765625, + "learning_rate": 9.36256323777403e-07, + "loss": 0.0001, + "reward": 2.498222589492798, + "reward_std": 0.34490717202425003, + "rewards/final_reward": 0.5208702977899072, + "rewards/mask_iou_reward": 0.2604351488949536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.49822261929512024, + "rewards/thk_ans_format_reward": 1.0, + "step": 189, + "think_completion_length": 64.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.421875, + "epoch": 0.32040472175379425, + "grad_norm": 4.210162438049502, + "kl": 0.1533203125, + "learning_rate": 9.359190556492411e-07, + "loss": 0.0002, + "reward": 2.855989098548889, + "reward_std": 0.2442128323018551, + "rewards/final_reward": 1.5877153634620145, + "rewards/mask_iou_reward": 0.7938576817310072, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8559889495372772, + "rewards/thk_ans_format_reward": 1.0, + "step": 190, + "think_completion_length": 63.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 0.3220910623946037, + "grad_norm": 5.6369854227467835, + "kl": 0.1591796875, + "learning_rate": 9.355817875210792e-07, + "loss": 0.0002, + "reward": 2.8796567916870117, + "reward_std": 0.24023566395044327, + "rewards/final_reward": 1.531722567489063, + "rewards/mask_iou_reward": 0.7658612837445316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.879656970500946, + "rewards/thk_ans_format_reward": 1.0, + "step": 191, + "think_completion_length": 55.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.34375, + "epoch": 0.32377740303541314, + "grad_norm": 6.757754073902499, + "kl": 0.142578125, + "learning_rate": 9.352445193929174e-07, + "loss": 0.0001, + "reward": 3.378178596496582, + "reward_std": 0.29115166515111923, + "rewards/final_reward": 1.6156733867945872, + "rewards/mask_iou_reward": 0.8078366933972936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.378178596496582, + "rewards/thk_ans_format_reward": 1.0, + "step": 192, + "think_completion_length": 65.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.625, + "epoch": 0.3254637436762226, + "grad_norm": 19.7363508642686, + "kl": 0.1484375, + "learning_rate": 9.349072512647555e-07, + "loss": 0.0001, + "reward": 2.5394753217697144, + "reward_std": 0.15710114687681198, + "rewards/final_reward": 0.2791865142332825, + "rewards/mask_iou_reward": 0.13959325711664125, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5394753515720367, + "rewards/thk_ans_format_reward": 1.0, + "step": 193, + "think_completion_length": 53.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.828125, + "epoch": 0.327150084317032, + "grad_norm": 4.867914820040121, + "kl": 0.15283203125, + "learning_rate": 9.345699831365936e-07, + "loss": 0.0002, + "reward": 3.283694863319397, + "reward_std": 0.3259827569127083, + "rewards/final_reward": 1.0945420370292709, + "rewards/mask_iou_reward": 0.5472710185146354, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.283694863319397, + "rewards/thk_ans_format_reward": 1.0, + "step": 194, + "think_completion_length": 56.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.84375, + "epoch": 0.3288364249578415, + "grad_norm": 8.986367787783225, + "kl": 0.14794921875, + "learning_rate": 9.342327150084317e-07, + "loss": 0.0001, + "reward": 3.0662416219711304, + "reward_std": 0.5097359418869019, + "rewards/final_reward": 1.0575091683178994, + "rewards/mask_iou_reward": 0.5287545841589497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0662416219711304, + "rewards/thk_ans_format_reward": 1.0, + "step": 195, + "think_completion_length": 68.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.96875, + "epoch": 0.3305227655986509, + "grad_norm": 4.458643815289693, + "kl": 0.17822265625, + "learning_rate": 9.338954468802697e-07, + "loss": 0.0002, + "reward": 2.6947516202926636, + "reward_std": 0.3916451036930084, + "rewards/final_reward": 0.6320579136922702, + "rewards/mask_iou_reward": 0.3160289568461351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6947516202926636, + "rewards/thk_ans_format_reward": 1.0, + "step": 196, + "think_completion_length": 61.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.296875, + "epoch": 0.33220910623946037, + "grad_norm": 8.627113179501139, + "kl": 0.124267578125, + "learning_rate": 9.335581787521078e-07, + "loss": 0.0001, + "reward": 2.435835838317871, + "reward_std": 0.2502327188849449, + "rewards/final_reward": 0.8284848647940978, + "rewards/mask_iou_reward": 0.4142424323970489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.43583589792251587, + "rewards/thk_ans_format_reward": 1.0, + "step": 197, + "think_completion_length": 66.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.484375, + "epoch": 0.33389544688026984, + "grad_norm": 10.433763299061711, + "kl": 0.1923828125, + "learning_rate": 9.33220910623946e-07, + "loss": 0.0002, + "reward": 2.810659885406494, + "reward_std": 0.2830319292843342, + "rewards/final_reward": 1.0020295015503957, + "rewards/mask_iou_reward": 0.5010147507751979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8106599748134613, + "rewards/thk_ans_format_reward": 1.0, + "step": 198, + "think_completion_length": 65.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.296875, + "epoch": 0.33558178752107926, + "grad_norm": 4.186736206358417, + "kl": 0.2060546875, + "learning_rate": 9.328836424957841e-07, + "loss": 0.0002, + "reward": 2.5021262168884277, + "reward_std": 0.4148203581571579, + "rewards/final_reward": 1.0089048969836152, + "rewards/mask_iou_reward": 0.5044524484918076, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.5333761423826218, + "rewards/thk_ans_format_reward": 0.984375, + "step": 199, + "think_completion_length": 68.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.90625, + "epoch": 0.3372681281618887, + "grad_norm": 9.22747285482429, + "kl": 0.16259765625, + "learning_rate": 9.325463743676222e-07, + "loss": 0.0002, + "reward": 3.107977509498596, + "reward_std": 0.3112121522426605, + "rewards/final_reward": 1.162340815426343, + "rewards/mask_iou_reward": 0.5811704077131715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1079775094985962, + "rewards/thk_ans_format_reward": 1.0, + "step": 200, + "think_completion_length": 77.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.875, + "epoch": 0.33895446880269814, + "grad_norm": 9.889047145437663, + "kl": 0.18896484375, + "learning_rate": 9.322091062394604e-07, + "loss": 0.0002, + "reward": 3.412969708442688, + "reward_std": 0.457067608833313, + "rewards/final_reward": 1.4231911438686162, + "rewards/mask_iou_reward": 0.7115955719343081, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4129695892333984, + "rewards/thk_ans_format_reward": 1.0, + "step": 201, + "think_completion_length": 72.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.5625, + "epoch": 0.3406408094435076, + "grad_norm": 11.631469733744547, + "kl": 0.17578125, + "learning_rate": 9.318718381112985e-07, + "loss": 0.0002, + "reward": 3.0336802005767822, + "reward_std": 0.5069368779659271, + "rewards/final_reward": 0.775275597891573, + "rewards/mask_iou_reward": 0.3876377989457865, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0649300515651703, + "rewards/thk_ans_format_reward": 0.984375, + "step": 202, + "think_completion_length": 76.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.484375, + "epoch": 0.342327150084317, + "grad_norm": 4.719467738906433, + "kl": 0.19921875, + "learning_rate": 9.315345699831365e-07, + "loss": 0.0002, + "reward": 2.905851364135742, + "reward_std": 0.3175694327801466, + "rewards/final_reward": 1.1055562593125423, + "rewards/mask_iou_reward": 0.5527781296562712, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9371014535427094, + "rewards/thk_ans_format_reward": 0.984375, + "step": 203, + "think_completion_length": 78.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.859375, + "epoch": 0.3440134907251265, + "grad_norm": 5.397178764490908, + "kl": 0.216796875, + "learning_rate": 9.311973018549747e-07, + "loss": 0.0002, + "reward": 3.009737014770508, + "reward_std": 0.13770561665296555, + "rewards/final_reward": 1.4083112152036812, + "rewards/mask_iou_reward": 0.7041556076018406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.009736955165863, + "rewards/thk_ans_format_reward": 1.0, + "step": 204, + "think_completion_length": 86.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.84375, + "epoch": 0.3456998313659359, + "grad_norm": 3.3032440569737367, + "kl": 0.19921875, + "learning_rate": 9.308600337268127e-07, + "loss": 0.0002, + "reward": 2.865081787109375, + "reward_std": 0.4096911549568176, + "rewards/final_reward": 1.1043083846033233, + "rewards/mask_iou_reward": 0.5521541923016616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8650819063186646, + "rewards/thk_ans_format_reward": 1.0, + "step": 205, + "think_completion_length": 87.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.140625, + "epoch": 0.3473861720067454, + "grad_norm": 10.093800490282504, + "kl": 0.1943359375, + "learning_rate": 9.305227655986508e-07, + "loss": 0.0002, + "reward": 2.715272068977356, + "reward_std": 0.21183528751134872, + "rewards/final_reward": 0.6873399231732058, + "rewards/mask_iou_reward": 0.3436699615866029, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7152720987796783, + "rewards/thk_ans_format_reward": 1.0, + "step": 206, + "think_completion_length": 74.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.171875, + "epoch": 0.3490725126475548, + "grad_norm": 4.865522460663028, + "kl": 0.21435546875, + "learning_rate": 9.30185497470489e-07, + "loss": 0.0002, + "reward": 3.131924629211426, + "reward_std": 0.3374939039349556, + "rewards/final_reward": 0.5539088155304328, + "rewards/mask_iou_reward": 0.2769544077652164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1319246292114258, + "rewards/thk_ans_format_reward": 1.0, + "step": 207, + "think_completion_length": 85.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.703125, + "epoch": 0.35075885328836426, + "grad_norm": 4.473708487184476, + "kl": 0.16455078125, + "learning_rate": 9.298482293423271e-07, + "loss": 0.0002, + "reward": 2.883733034133911, + "reward_std": 0.40940313041210175, + "rewards/final_reward": 1.436671863940748, + "rewards/mask_iou_reward": 0.718335931970374, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9149829745292664, + "rewards/thk_ans_format_reward": 0.984375, + "step": 208, + "think_completion_length": 97.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.921875, + "epoch": 0.3524451939291737, + "grad_norm": 23.788673448569757, + "kl": 0.18408203125, + "learning_rate": 9.295109612141652e-07, + "loss": 0.0002, + "reward": 2.941943883895874, + "reward_std": 0.4504036456346512, + "rewards/final_reward": 0.470037845282397, + "rewards/mask_iou_reward": 0.2350189226411985, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.0044439435005188, + "rewards/thk_ans_format_reward": 0.96875, + "step": 209, + "think_completion_length": 75.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.328125, + "epoch": 0.35413153456998314, + "grad_norm": 5.372868142835202, + "kl": 0.24853515625, + "learning_rate": 9.291736930860034e-07, + "loss": 0.0002, + "reward": 2.724569797515869, + "reward_std": 0.41185761988162994, + "rewards/final_reward": 0.7104813213657171, + "rewards/mask_iou_reward": 0.35524066068285853, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7558198869228363, + "rewards/thk_ans_format_reward": 0.984375, + "step": 210, + "think_completion_length": 71.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.265625, + "epoch": 0.35581787521079256, + "grad_norm": 7.693384240552429, + "kl": 0.23828125, + "learning_rate": 9.288364249578415e-07, + "loss": 0.0002, + "reward": 3.142953395843506, + "reward_std": 0.4350839853286743, + "rewards/final_reward": 1.0316405850495944, + "rewards/mask_iou_reward": 0.5158202925247972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1429533958435059, + "rewards/thk_ans_format_reward": 1.0, + "step": 211, + "think_completion_length": 67.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.078125, + "epoch": 0.357504215851602, + "grad_norm": 24.52565671704594, + "kl": 0.189453125, + "learning_rate": 9.284991568296796e-07, + "loss": 0.0002, + "reward": 3.507409691810608, + "reward_std": 0.2794800400733948, + "rewards/final_reward": 1.511632496053278, + "rewards/mask_iou_reward": 0.755816248026639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5074098706245422, + "rewards/thk_ans_format_reward": 1.0, + "step": 212, + "think_completion_length": 76.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.875, + "epoch": 0.3591905564924115, + "grad_norm": 104.11907087866024, + "kl": 0.2138671875, + "learning_rate": 9.281618887015177e-07, + "loss": 0.0002, + "reward": 3.1027177572250366, + "reward_std": 0.3239024728536606, + "rewards/final_reward": 1.0709143355153474, + "rewards/mask_iou_reward": 0.5354571677576737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1027177274227142, + "rewards/thk_ans_format_reward": 1.0, + "step": 213, + "think_completion_length": 76.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.84375, + "epoch": 0.3608768971332209, + "grad_norm": 6.079891869406322, + "kl": 0.22998046875, + "learning_rate": 9.278246205733557e-07, + "loss": 0.0002, + "reward": 3.1863114833831787, + "reward_std": 0.35681121051311493, + "rewards/final_reward": 1.0386986981609376, + "rewards/mask_iou_reward": 0.5193493490804688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1863115429878235, + "rewards/thk_ans_format_reward": 1.0, + "step": 214, + "think_completion_length": 81.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.015625, + "epoch": 0.3625632377740304, + "grad_norm": 20.992549608832288, + "kl": 0.18701171875, + "learning_rate": 9.274873524451939e-07, + "loss": 0.0002, + "reward": 3.2150684595108032, + "reward_std": 0.3001432493329048, + "rewards/final_reward": 1.4833759696702087, + "rewards/mask_iou_reward": 0.7416879848351043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2150683999061584, + "rewards/thk_ans_format_reward": 1.0, + "step": 215, + "think_completion_length": 77.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.15625, + "epoch": 0.3642495784148398, + "grad_norm": 3.985537844159363, + "kl": 0.21484375, + "learning_rate": 9.27150084317032e-07, + "loss": 0.0002, + "reward": 2.9944499731063843, + "reward_std": 0.40752771496772766, + "rewards/final_reward": 0.9913912240867233, + "rewards/mask_iou_reward": 0.49569561204336166, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9944499731063843, + "rewards/thk_ans_format_reward": 1.0, + "step": 216, + "think_completion_length": 75.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.90625, + "epoch": 0.36593591905564926, + "grad_norm": 12.055847835960044, + "kl": 0.2060546875, + "learning_rate": 9.268128161888701e-07, + "loss": 0.0002, + "reward": 3.2611602544784546, + "reward_std": 0.2674819827079773, + "rewards/final_reward": 1.1373292199253833, + "rewards/mask_iou_reward": 0.5686646099626916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2611603140830994, + "rewards/thk_ans_format_reward": 1.0, + "step": 217, + "think_completion_length": 75.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.421875, + "epoch": 0.3676222596964587, + "grad_norm": 4.2602635786354535, + "kl": 0.208984375, + "learning_rate": 9.264755480607083e-07, + "loss": 0.0002, + "reward": 2.6974871158599854, + "reward_std": 0.3205343186855316, + "rewards/final_reward": 0.19768463854294763, + "rewards/mask_iou_reward": 0.09884231927147381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6974871754646301, + "rewards/thk_ans_format_reward": 1.0, + "step": 218, + "think_completion_length": 82.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.703125, + "epoch": 0.36930860033726814, + "grad_norm": 5.154948821299118, + "kl": 0.20458984375, + "learning_rate": 9.261382799325464e-07, + "loss": 0.0002, + "reward": 3.2684760093688965, + "reward_std": 0.37709657847881317, + "rewards/final_reward": 1.0383823792548679, + "rewards/mask_iou_reward": 0.5191911896274339, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2684761881828308, + "rewards/thk_ans_format_reward": 1.0, + "step": 219, + "think_completion_length": 86.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.546875, + "epoch": 0.37099494097807756, + "grad_norm": 10.042230308759423, + "kl": 0.22021484375, + "learning_rate": 9.258010118043844e-07, + "loss": 0.0002, + "reward": 2.912859559059143, + "reward_std": 0.18924781680107117, + "rewards/final_reward": 0.8126117230948996, + "rewards/mask_iou_reward": 0.4063058615474498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9128594994544983, + "rewards/thk_ans_format_reward": 1.0, + "step": 220, + "think_completion_length": 69.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.25, + "epoch": 0.37268128161888703, + "grad_norm": 5.580949729321493, + "kl": 0.21826171875, + "learning_rate": 9.254637436762226e-07, + "loss": 0.0002, + "reward": 2.6421183347702026, + "reward_std": 0.21327205747365952, + "rewards/final_reward": 0.38601525011759963, + "rewards/mask_iou_reward": 0.19300762505879981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6421183943748474, + "rewards/thk_ans_format_reward": 1.0, + "step": 221, + "think_completion_length": 69.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.828125, + "epoch": 0.37436762225969644, + "grad_norm": 4.732336023491863, + "kl": 0.26611328125, + "learning_rate": 9.251264755480606e-07, + "loss": 0.0003, + "reward": 3.230614185333252, + "reward_std": 0.29589555226266384, + "rewards/final_reward": 1.5953083083239932, + "rewards/mask_iou_reward": 0.7976541541619966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2306141257286072, + "rewards/thk_ans_format_reward": 1.0, + "step": 222, + "think_completion_length": 68.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.421875, + "epoch": 0.3760539629005059, + "grad_norm": 18.74043446734351, + "kl": 0.197265625, + "learning_rate": 9.247892074198987e-07, + "loss": 0.0002, + "reward": 2.9248836040496826, + "reward_std": 0.1982583925127983, + "rewards/final_reward": 1.5204117591821684, + "rewards/mask_iou_reward": 0.7602058795910842, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9248837232589722, + "rewards/thk_ans_format_reward": 1.0, + "step": 223, + "think_completion_length": 82.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.203125, + "epoch": 0.3777403035413153, + "grad_norm": 4.867690589893153, + "kl": 0.2177734375, + "learning_rate": 9.244519392917369e-07, + "loss": 0.0002, + "reward": 2.6547787189483643, + "reward_std": 0.2125616818666458, + "rewards/final_reward": 0.030600702763538143, + "rewards/mask_iou_reward": 0.015300351381769071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6547787487506866, + "rewards/thk_ans_format_reward": 1.0, + "step": 224, + "think_completion_length": 81.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.015625, + "epoch": 0.3794266441821248, + "grad_norm": 5.204370347747985, + "kl": 0.22021484375, + "learning_rate": 9.24114671163575e-07, + "loss": 0.0002, + "reward": 3.1091920137405396, + "reward_std": 0.3276357799768448, + "rewards/final_reward": 1.1539964994728786, + "rewards/mask_iou_reward": 0.5769982497364393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1091918349266052, + "rewards/thk_ans_format_reward": 1.0, + "step": 225, + "think_completion_length": 76.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.40625, + "epoch": 0.3811129848229342, + "grad_norm": 7.562543453423681, + "kl": 0.205078125, + "learning_rate": 9.237774030354131e-07, + "loss": 0.0002, + "reward": 3.0599652528762817, + "reward_std": 0.5095875263214111, + "rewards/final_reward": 0.9059710345682561, + "rewards/mask_iou_reward": 0.45298551728412806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.059965193271637, + "rewards/thk_ans_format_reward": 1.0, + "step": 226, + "think_completion_length": 68.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.65625, + "epoch": 0.3827993254637437, + "grad_norm": 22.849487722280188, + "kl": 0.2255859375, + "learning_rate": 9.234401349072513e-07, + "loss": 0.0002, + "reward": 3.07417631149292, + "reward_std": 0.37541690468788147, + "rewards/final_reward": 1.455683295385246, + "rewards/mask_iou_reward": 0.727841647692623, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0741761326789856, + "rewards/thk_ans_format_reward": 1.0, + "step": 227, + "think_completion_length": 66.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.921875, + "epoch": 0.3844856661045531, + "grad_norm": 3.2900971059049224, + "kl": 0.19873046875, + "learning_rate": 9.231028667790893e-07, + "loss": 0.0002, + "reward": 3.077646255493164, + "reward_std": 0.4934057295322418, + "rewards/final_reward": 1.4828937291881843, + "rewards/mask_iou_reward": 0.7414468645940921, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1088961362838745, + "rewards/thk_ans_format_reward": 0.984375, + "step": 228, + "think_completion_length": 90.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.546875, + "epoch": 0.38617200674536256, + "grad_norm": 6.725551609246913, + "kl": 0.21240234375, + "learning_rate": 9.227655986509274e-07, + "loss": 0.0002, + "reward": 3.021067500114441, + "reward_std": 0.21517714858055115, + "rewards/final_reward": 1.1519169989098978, + "rewards/mask_iou_reward": 0.5759584994549489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0210675299167633, + "rewards/thk_ans_format_reward": 1.0, + "step": 229, + "think_completion_length": 70.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.734375, + "epoch": 0.38785834738617203, + "grad_norm": 4.0711352651747195, + "kl": 0.205078125, + "learning_rate": 9.224283305227656e-07, + "loss": 0.0002, + "reward": 2.8537700176239014, + "reward_std": 0.37378963828086853, + "rewards/final_reward": 0.6391967309039653, + "rewards/mask_iou_reward": 0.31959836545198267, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8537698686122894, + "rewards/thk_ans_format_reward": 1.0, + "step": 230, + "think_completion_length": 85.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.625, + "epoch": 0.38954468802698144, + "grad_norm": 7.15848584277918, + "kl": 0.216796875, + "learning_rate": 9.220910623946036e-07, + "loss": 0.0002, + "reward": 3.125742197036743, + "reward_std": 0.26193077489733696, + "rewards/final_reward": 1.075020571264583, + "rewards/mask_iou_reward": 0.5375102856322915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.125742256641388, + "rewards/thk_ans_format_reward": 1.0, + "step": 231, + "think_completion_length": 63.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.59375, + "epoch": 0.3912310286677909, + "grad_norm": 7.744629541689001, + "kl": 0.333984375, + "learning_rate": 9.217537942664417e-07, + "loss": 0.0003, + "reward": 3.0868237018585205, + "reward_std": 0.41837984323501587, + "rewards/final_reward": 0.9561375750120857, + "rewards/mask_iou_reward": 0.47806878750604287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.08682382106781, + "rewards/thk_ans_format_reward": 1.0, + "step": 232, + "think_completion_length": 73.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.453125, + "epoch": 0.39291736930860033, + "grad_norm": 8.404435287451914, + "kl": 0.212890625, + "learning_rate": 9.214165261382799e-07, + "loss": 0.0002, + "reward": 2.928203582763672, + "reward_std": 0.4975929260253906, + "rewards/final_reward": 0.5828971869005508, + "rewards/mask_iou_reward": 0.2914485934502754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9282036423683167, + "rewards/thk_ans_format_reward": 1.0, + "step": 233, + "think_completion_length": 65.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.96875, + "epoch": 0.3946037099494098, + "grad_norm": 14.476592822333908, + "kl": 0.19140625, + "learning_rate": 9.21079258010118e-07, + "loss": 0.0002, + "reward": 2.635561227798462, + "reward_std": 0.32975369691848755, + "rewards/final_reward": 0.9387983024693395, + "rewards/mask_iou_reward": 0.46939915123466974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6355613172054291, + "rewards/thk_ans_format_reward": 1.0, + "step": 234, + "think_completion_length": 80.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.765625, + "epoch": 0.3962900505902192, + "grad_norm": 5.813593781329736, + "kl": 0.24169921875, + "learning_rate": 9.207419898819561e-07, + "loss": 0.0002, + "reward": 3.0836466550827026, + "reward_std": 0.23019906878471375, + "rewards/final_reward": 1.218153597393058, + "rewards/mask_iou_reward": 0.609076798696529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0836465060710907, + "rewards/thk_ans_format_reward": 1.0, + "step": 235, + "think_completion_length": 81.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.859375, + "epoch": 0.3979763912310287, + "grad_norm": 7.446504853574746, + "kl": 0.2236328125, + "learning_rate": 9.204047217537943e-07, + "loss": 0.0002, + "reward": 2.911117911338806, + "reward_std": 0.2473655566573143, + "rewards/final_reward": 0.6911036456671604, + "rewards/mask_iou_reward": 0.3455518228335802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9111178815364838, + "rewards/thk_ans_format_reward": 1.0, + "step": 236, + "think_completion_length": 58.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.046875, + "epoch": 0.3996627318718381, + "grad_norm": 6.320005332811707, + "kl": 0.2216796875, + "learning_rate": 9.200674536256323e-07, + "loss": 0.0002, + "reward": 2.954068422317505, + "reward_std": 0.4168316461145878, + "rewards/final_reward": 0.639526934138163, + "rewards/mask_iou_reward": 0.3197634670690815, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9540683627128601, + "rewards/thk_ans_format_reward": 1.0, + "step": 237, + "think_completion_length": 71.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.09375, + "epoch": 0.40134907251264756, + "grad_norm": 5.998901795263871, + "kl": 0.26318359375, + "learning_rate": 9.197301854974705e-07, + "loss": 0.0003, + "reward": 3.161041498184204, + "reward_std": 0.17126264609396458, + "rewards/final_reward": 0.8910959114004183, + "rewards/mask_iou_reward": 0.44554795570020916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.161041498184204, + "rewards/thk_ans_format_reward": 1.0, + "step": 238, + "think_completion_length": 57.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.328125, + "epoch": 0.403035413153457, + "grad_norm": 7.6852691733431655, + "kl": 0.22314453125, + "learning_rate": 9.193929173693086e-07, + "loss": 0.0002, + "reward": 2.8813143968582153, + "reward_std": 0.2463463842868805, + "rewards/final_reward": 1.1477729166412296, + "rewards/mask_iou_reward": 0.5738864583206148, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8813144266605377, + "rewards/thk_ans_format_reward": 1.0, + "step": 239, + "think_completion_length": 68.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.5, + "epoch": 0.40472175379426645, + "grad_norm": 5.47140326939557, + "kl": 0.3115234375, + "learning_rate": 9.190556492411466e-07, + "loss": 0.0003, + "reward": 2.9627087116241455, + "reward_std": 0.45273715257644653, + "rewards/final_reward": 1.3423065764411553, + "rewards/mask_iou_reward": 0.6711532882205776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9627088606357574, + "rewards/thk_ans_format_reward": 1.0, + "step": 240, + "think_completion_length": 67.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.34375, + "epoch": 0.40640809443507586, + "grad_norm": 19.829125482202834, + "kl": 0.826171875, + "learning_rate": 9.187183811129848e-07, + "loss": 0.0008, + "reward": 2.9754170179367065, + "reward_std": 0.4061010330915451, + "rewards/final_reward": 1.052455780557629, + "rewards/mask_iou_reward": 0.5262278902788144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9754170775413513, + "rewards/thk_ans_format_reward": 1.0, + "step": 241, + "think_completion_length": 72.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.15625, + "epoch": 0.40809443507588533, + "grad_norm": 28.243128773381454, + "kl": 0.2041015625, + "learning_rate": 9.183811129848229e-07, + "loss": 0.0002, + "reward": 3.6813935041427612, + "reward_std": 0.1992366872727871, + "rewards/final_reward": 1.8080672760079621, + "rewards/mask_iou_reward": 0.9040336380039811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6813934445381165, + "rewards/thk_ans_format_reward": 1.0, + "step": 242, + "think_completion_length": 72.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.359375, + "epoch": 0.40978077571669475, + "grad_norm": 140.06539665595105, + "kl": 0.2353515625, + "learning_rate": 9.18043844856661e-07, + "loss": 0.0002, + "reward": 3.1565033197402954, + "reward_std": 0.4416676461696625, + "rewards/final_reward": 1.2314951954134759, + "rewards/mask_iou_reward": 0.6157475977067379, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1565033495426178, + "rewards/thk_ans_format_reward": 1.0, + "step": 243, + "think_completion_length": 75.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.609375, + "epoch": 0.4114671163575042, + "grad_norm": 7.392254590000325, + "kl": 0.23388671875, + "learning_rate": 9.177065767284992e-07, + "loss": 0.0002, + "reward": 2.6299526691436768, + "reward_std": 0.2196236252784729, + "rewards/final_reward": 0.4655239442586937, + "rewards/mask_iou_reward": 0.23276197212934685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6299527883529663, + "rewards/thk_ans_format_reward": 1.0, + "step": 244, + "think_completion_length": 72.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.015625, + "epoch": 0.4131534569983137, + "grad_norm": 26.232242488237784, + "kl": 0.24609375, + "learning_rate": 9.173693086003372e-07, + "loss": 0.0002, + "reward": 2.877917170524597, + "reward_std": 0.301144540309906, + "rewards/final_reward": 0.19570692172312354, + "rewards/mask_iou_reward": 0.09785346086156177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8779171109199524, + "rewards/thk_ans_format_reward": 1.0, + "step": 245, + "think_completion_length": 72.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.609375, + "epoch": 0.4148397976391231, + "grad_norm": 5.983543827094884, + "kl": 0.25146484375, + "learning_rate": 9.170320404721753e-07, + "loss": 0.0003, + "reward": 3.2205491065979004, + "reward_std": 0.1830149181187153, + "rewards/final_reward": 1.852683062161943, + "rewards/mask_iou_reward": 0.9263415310809715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2361740469932556, + "rewards/thk_ans_format_reward": 0.984375, + "step": 246, + "think_completion_length": 66.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.734375, + "epoch": 0.41652613827993257, + "grad_norm": 7.9257714065232685, + "kl": 0.2314453125, + "learning_rate": 9.166947723440135e-07, + "loss": 0.0002, + "reward": 3.2331912517547607, + "reward_std": 0.33761420100927353, + "rewards/final_reward": 1.2358968368078678, + "rewards/mask_iou_reward": 0.6179484184039339, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2331913709640503, + "rewards/thk_ans_format_reward": 1.0, + "step": 247, + "think_completion_length": 76.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.328125, + "epoch": 0.418212478920742, + "grad_norm": 5.404389365456824, + "kl": 0.22119140625, + "learning_rate": 9.163575042158516e-07, + "loss": 0.0002, + "reward": 2.731718420982361, + "reward_std": 0.15819299221038818, + "rewards/final_reward": 0.7831214945094458, + "rewards/mask_iou_reward": 0.3915607472547229, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7473434042185545, + "rewards/thk_ans_format_reward": 1.0, + "step": 248, + "think_completion_length": 67.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.84375, + "epoch": 0.41989881956155145, + "grad_norm": 6.896631453177303, + "kl": 0.24755859375, + "learning_rate": 9.160202360876896e-07, + "loss": 0.0002, + "reward": 3.7351213693618774, + "reward_std": 0.15707053616642952, + "rewards/final_reward": 1.690610043355829, + "rewards/mask_iou_reward": 0.8453050216779145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7351213693618774, + "rewards/thk_ans_format_reward": 1.0, + "step": 249, + "think_completion_length": 76.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.53125, + "epoch": 0.42158516020236086, + "grad_norm": 11.797769654791798, + "kl": 0.24560546875, + "learning_rate": 9.156829679595278e-07, + "loss": 0.0002, + "reward": 3.319765567779541, + "reward_std": 0.22760100662708282, + "rewards/final_reward": 1.43913601584751, + "rewards/mask_iou_reward": 0.719568007923755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.319765418767929, + "rewards/thk_ans_format_reward": 1.0, + "step": 250, + "think_completion_length": 67.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.953125, + "epoch": 0.42327150084317033, + "grad_norm": 4.103837453753516, + "kl": 0.228515625, + "learning_rate": 9.153456998313659e-07, + "loss": 0.0002, + "reward": 3.584295630455017, + "reward_std": 0.15136371925473213, + "rewards/final_reward": 1.371311237407304, + "rewards/mask_iou_reward": 0.685655618703652, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.599920630455017, + "rewards/thk_ans_format_reward": 0.984375, + "step": 251, + "think_completion_length": 79.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.96875, + "epoch": 0.42495784148397975, + "grad_norm": 3.4126528618942507, + "kl": 0.244140625, + "learning_rate": 9.15008431703204e-07, + "loss": 0.0002, + "reward": 3.032737374305725, + "reward_std": 0.218122199177742, + "rewards/final_reward": 0.9424829789653526, + "rewards/mask_iou_reward": 0.4712414894826763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.032737284898758, + "rewards/thk_ans_format_reward": 1.0, + "step": 252, + "think_completion_length": 72.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.078125, + "epoch": 0.4266441821247892, + "grad_norm": 4.673604751065417, + "kl": 0.265625, + "learning_rate": 9.146711635750421e-07, + "loss": 0.0003, + "reward": 2.7446590662002563, + "reward_std": 0.311612606048584, + "rewards/final_reward": 0.6923241188788728, + "rewards/mask_iou_reward": 0.3461620594394364, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7446590214967728, + "rewards/thk_ans_format_reward": 1.0, + "step": 253, + "think_completion_length": 81.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9375, + "epoch": 0.42833052276559863, + "grad_norm": 4.250104365560061, + "kl": 0.23486328125, + "learning_rate": 9.143338954468802e-07, + "loss": 0.0002, + "reward": 3.027690887451172, + "reward_std": 0.4209420531988144, + "rewards/final_reward": 0.9340073633977319, + "rewards/mask_iou_reward": 0.46700368169886597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.02769073843956, + "rewards/thk_ans_format_reward": 1.0, + "step": 254, + "think_completion_length": 71.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.265625, + "epoch": 0.4300168634064081, + "grad_norm": 4.1133314139001, + "kl": 0.23876953125, + "learning_rate": 9.139966273187183e-07, + "loss": 0.0002, + "reward": 3.125454545021057, + "reward_std": 0.4221559911966324, + "rewards/final_reward": 1.0887862009263054, + "rewards/mask_iou_reward": 0.5443931004631527, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1254545748233795, + "rewards/thk_ans_format_reward": 1.0, + "step": 255, + "think_completion_length": 71.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75, + "epoch": 0.4317032040472175, + "grad_norm": 10.371535649498034, + "kl": 0.21533203125, + "learning_rate": 9.136593591905565e-07, + "loss": 0.0002, + "reward": 3.276609420776367, + "reward_std": 0.35103708505630493, + "rewards/final_reward": 1.1444245548335776, + "rewards/mask_iou_reward": 0.5722122774167888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2766093015670776, + "rewards/thk_ans_format_reward": 1.0, + "step": 256, + "think_completion_length": 73.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.125, + "epoch": 0.433389544688027, + "grad_norm": 5.379368798881943, + "kl": 0.41845703125, + "learning_rate": 9.133220910623946e-07, + "loss": 0.0004, + "reward": 2.9322704076766968, + "reward_std": 0.2937382832169533, + "rewards/final_reward": 0.9810359015544533, + "rewards/mask_iou_reward": 0.49051795077722665, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9478955268859863, + "rewards/thk_ans_format_reward": 0.984375, + "step": 257, + "think_completion_length": 79.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.4350758853288364, + "grad_norm": 4.0113801797767, + "kl": 0.22900390625, + "learning_rate": 9.129848229342326e-07, + "loss": 0.0005, + "reward": 2.6997172832489014, + "reward_std": 0.1351064220070839, + "rewards/final_reward": 0.4088354568064064, + "rewards/mask_iou_reward": 0.2044177284032032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6997173428535461, + "rewards/thk_ans_format_reward": 1.0, + "step": 258, + "think_completion_length": 81.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.25, + "epoch": 0.43676222596964587, + "grad_norm": 4.619601971068037, + "kl": 0.4501953125, + "learning_rate": 9.126475548060708e-07, + "loss": 0.0004, + "reward": 3.021946907043457, + "reward_std": 0.17659004405140877, + "rewards/final_reward": 1.3774581014446288, + "rewards/mask_iou_reward": 0.6887290507223144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0219468921422958, + "rewards/thk_ans_format_reward": 1.0, + "step": 259, + "think_completion_length": 81.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.484375, + "epoch": 0.43844856661045534, + "grad_norm": 3.9974465215683224, + "kl": 0.2001953125, + "learning_rate": 9.123102866779089e-07, + "loss": 0.0002, + "reward": 3.290497064590454, + "reward_std": 0.27161210775375366, + "rewards/final_reward": 1.60521735437425, + "rewards/mask_iou_reward": 0.802608677187125, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2904971241950989, + "rewards/thk_ans_format_reward": 1.0, + "step": 260, + "think_completion_length": 83.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.015625, + "epoch": 0.44013490725126475, + "grad_norm": 4.270684837247345, + "kl": 0.2353515625, + "learning_rate": 9.119730185497469e-07, + "loss": 0.0002, + "reward": 2.9826985597610474, + "reward_std": 0.27017855644226074, + "rewards/final_reward": 0.9011500105943151, + "rewards/mask_iou_reward": 0.45057500529715755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9826985597610474, + "rewards/thk_ans_format_reward": 1.0, + "step": 261, + "think_completion_length": 82.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.46875, + "epoch": 0.4418212478920742, + "grad_norm": 5.01938701583992, + "kl": 0.2294921875, + "learning_rate": 9.116357504215851e-07, + "loss": 0.0002, + "reward": 2.971551537513733, + "reward_std": 0.17417415231466293, + "rewards/final_reward": 0.3612003790476864, + "rewards/mask_iou_reward": 0.1806001895238432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9715515375137329, + "rewards/thk_ans_format_reward": 1.0, + "step": 262, + "think_completion_length": 91.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.203125, + "epoch": 0.44350758853288363, + "grad_norm": 6.835284114070682, + "kl": 0.2275390625, + "learning_rate": 9.112984822934232e-07, + "loss": 0.0002, + "reward": 3.203929901123047, + "reward_std": 0.32439279556274414, + "rewards/final_reward": 1.011724169976494, + "rewards/mask_iou_reward": 0.505862084988247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2195549309253693, + "rewards/thk_ans_format_reward": 0.984375, + "step": 263, + "think_completion_length": 87.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.453125, + "epoch": 0.4451939291736931, + "grad_norm": 8.018046696171343, + "kl": 0.23388671875, + "learning_rate": 9.109612141652614e-07, + "loss": 0.0002, + "reward": 2.3503458499908447, + "reward_std": 0.22059325873851776, + "rewards/final_reward": 0.5475854655181758, + "rewards/mask_iou_reward": 0.2737927327590879, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.35034577548503876, + "rewards/thk_ans_format_reward": 1.0, + "step": 264, + "think_completion_length": 87.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5625, + "epoch": 0.4468802698145025, + "grad_norm": 5.207261902627235, + "kl": 0.203125, + "learning_rate": 9.106239460370995e-07, + "loss": 0.0002, + "reward": 3.234385371208191, + "reward_std": 0.2574783265590668, + "rewards/final_reward": 1.7540360878854586, + "rewards/mask_iou_reward": 0.8770180439427293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2343851923942566, + "rewards/thk_ans_format_reward": 1.0, + "step": 265, + "think_completion_length": 97.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.828125, + "epoch": 0.448566610455312, + "grad_norm": 4.894515919476582, + "kl": 0.1962890625, + "learning_rate": 9.102866779089376e-07, + "loss": 0.0002, + "reward": 3.1356217861175537, + "reward_std": 0.3286105990409851, + "rewards/final_reward": 1.07730285055642, + "rewards/mask_iou_reward": 0.53865142527821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1356218457221985, + "rewards/thk_ans_format_reward": 1.0, + "step": 266, + "think_completion_length": 107.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.28125, + "epoch": 0.4502529510961214, + "grad_norm": 5.429602045870938, + "kl": 0.1943359375, + "learning_rate": 9.099494097807757e-07, + "loss": 0.0002, + "reward": 2.8694194555282593, + "reward_std": 0.41457006335258484, + "rewards/final_reward": 0.9558318417611935, + "rewards/mask_iou_reward": 0.47791592088059676, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9006692469120026, + "rewards/thk_ans_format_reward": 0.984375, + "step": 267, + "think_completion_length": 101.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.890625, + "epoch": 0.45193929173693087, + "grad_norm": 12.128411701219036, + "kl": 0.22998046875, + "learning_rate": 9.096121416526138e-07, + "loss": 0.0002, + "reward": 2.6065198183059692, + "reward_std": 0.37309861183166504, + "rewards/final_reward": 0.198079350131303, + "rewards/mask_iou_reward": 0.0990396750656515, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6065197587013245, + "rewards/thk_ans_format_reward": 1.0, + "step": 268, + "think_completion_length": 105.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.828125, + "epoch": 0.4536256323777403, + "grad_norm": 5.932760228161688, + "kl": 0.1904296875, + "learning_rate": 9.092748735244519e-07, + "loss": 0.0002, + "reward": 2.5283087491989136, + "reward_std": 0.5122461318969727, + "rewards/final_reward": 0.5003546532870379, + "rewards/mask_iou_reward": 0.25017732664351894, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5283087491989136, + "rewards/thk_ans_format_reward": 1.0, + "step": 269, + "think_completion_length": 109.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.34375, + "epoch": 0.45531197301854975, + "grad_norm": 3.035312901499431, + "kl": 0.20703125, + "learning_rate": 9.0893760539629e-07, + "loss": 0.0002, + "reward": 3.0272092819213867, + "reward_std": 0.6400187015533447, + "rewards/final_reward": 0.7176652425113891, + "rewards/mask_iou_reward": 0.35883262125569454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0272094011306763, + "rewards/thk_ans_format_reward": 1.0, + "step": 270, + "think_completion_length": 95.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.5625, + "epoch": 0.45699831365935917, + "grad_norm": 3.3097869393690655, + "kl": 0.2314453125, + "learning_rate": 9.086003372681281e-07, + "loss": 0.0002, + "reward": 3.161636710166931, + "reward_std": 0.40839092433452606, + "rewards/final_reward": 0.9878549438773723, + "rewards/mask_iou_reward": 0.49392747193868614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1616367101669312, + "rewards/thk_ans_format_reward": 1.0, + "step": 271, + "think_completion_length": 106.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.9375, + "epoch": 0.45868465430016864, + "grad_norm": 13.954579861181404, + "kl": 0.228515625, + "learning_rate": 9.082630691399662e-07, + "loss": 0.0002, + "reward": 3.0976139307022095, + "reward_std": 0.33141565322875977, + "rewards/final_reward": 0.9411599724049827, + "rewards/mask_iou_reward": 0.47057998620249136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.097613900899887, + "rewards/thk_ans_format_reward": 1.0, + "step": 272, + "think_completion_length": 98.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.953125, + "epoch": 0.46037099494097805, + "grad_norm": 6.285702802859748, + "kl": 0.240234375, + "learning_rate": 9.079258010118044e-07, + "loss": 0.0002, + "reward": 2.4280236959457397, + "reward_std": 0.24160349369049072, + "rewards/final_reward": 0.22331540502332667, + "rewards/mask_iou_reward": 0.11165770251166333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.42802368104457855, + "rewards/thk_ans_format_reward": 1.0, + "step": 273, + "think_completion_length": 109.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.828125, + "epoch": 0.4620573355817875, + "grad_norm": 6.371702634806563, + "kl": 0.228515625, + "learning_rate": 9.075885328836425e-07, + "loss": 0.0002, + "reward": 2.871516704559326, + "reward_std": 0.31683170795440674, + "rewards/final_reward": 1.2616139609735537, + "rewards/mask_iou_reward": 0.6308069804867769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8715166747570038, + "rewards/thk_ans_format_reward": 1.0, + "step": 274, + "think_completion_length": 91.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.046875, + "epoch": 0.463743676222597, + "grad_norm": 9.870553593778212, + "kl": 0.2265625, + "learning_rate": 9.072512647554806e-07, + "loss": 0.0002, + "reward": 3.31233549118042, + "reward_std": 0.3460022658109665, + "rewards/final_reward": 1.3883268676465192, + "rewards/mask_iou_reward": 0.6941634338232596, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3123353123664856, + "rewards/thk_ans_format_reward": 1.0, + "step": 275, + "think_completion_length": 118.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.4375, + "epoch": 0.4654300168634064, + "grad_norm": 3.4862830215166274, + "kl": 0.2431640625, + "learning_rate": 9.069139966273187e-07, + "loss": 0.0002, + "reward": 2.803930640220642, + "reward_std": 0.25581035390496254, + "rewards/final_reward": 0.8173206091655374, + "rewards/mask_iou_reward": 0.4086603045827687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8039306998252869, + "rewards/thk_ans_format_reward": 1.0, + "step": 276, + "think_completion_length": 112.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.34375, + "epoch": 0.4671163575042159, + "grad_norm": 3.492125265472315, + "kl": 0.19677734375, + "learning_rate": 9.065767284991568e-07, + "loss": 0.0002, + "reward": 3.049085855484009, + "reward_std": 0.30213601887226105, + "rewards/final_reward": 1.094389741188175, + "rewards/mask_iou_reward": 0.5471948705940874, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0490858852863312, + "rewards/thk_ans_format_reward": 1.0, + "step": 277, + "think_completion_length": 109.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.453125, + "epoch": 0.4688026981450253, + "grad_norm": 4.262931594041564, + "kl": 0.2021484375, + "learning_rate": 9.062394603709948e-07, + "loss": 0.0002, + "reward": 2.6954082250595093, + "reward_std": 0.362802118062973, + "rewards/final_reward": 0.7194369269474945, + "rewards/mask_iou_reward": 0.35971846347374725, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.71103335916996, + "rewards/thk_ans_format_reward": 1.0, + "step": 278, + "think_completion_length": 103.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.453125, + "epoch": 0.47048903878583476, + "grad_norm": 5.514757215166888, + "kl": 0.23876953125, + "learning_rate": 9.05902192242833e-07, + "loss": 0.0002, + "reward": 2.784053087234497, + "reward_std": 0.2925054356455803, + "rewards/final_reward": 1.4111391442364694, + "rewards/mask_iou_reward": 0.7055695721182347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7840530574321747, + "rewards/thk_ans_format_reward": 1.0, + "step": 279, + "think_completion_length": 128.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5, + "epoch": 0.47217537942664417, + "grad_norm": 3.6758711061332123, + "kl": 0.208984375, + "learning_rate": 9.055649241146711e-07, + "loss": 0.0002, + "reward": 3.4984025955200195, + "reward_std": 0.3452688194811344, + "rewards/final_reward": 1.6300352304452077, + "rewards/mask_iou_reward": 0.8150176152226039, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.5296525955200195, + "rewards/thk_ans_format_reward": 0.984375, + "step": 280, + "think_completion_length": 95.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.890625, + "epoch": 0.47386172006745364, + "grad_norm": 5.7177998821481335, + "kl": 0.2177734375, + "learning_rate": 9.052276559865092e-07, + "loss": 0.0002, + "reward": 2.33639395236969, + "reward_std": 0.38313548266887665, + "rewards/final_reward": 0.46264121102616823, + "rewards/mask_iou_reward": 0.23132060551308412, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.39889395236968994, + "rewards/thk_ans_format_reward": 0.96875, + "step": 281, + "think_completion_length": 100.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.34375, + "epoch": 0.47554806070826305, + "grad_norm": 3.138825572604774, + "kl": 0.1982421875, + "learning_rate": 9.048903878583474e-07, + "loss": 0.0002, + "reward": 3.0318186283111572, + "reward_std": 0.33777186274528503, + "rewards/final_reward": 0.937564592938467, + "rewards/mask_iou_reward": 0.4687822964692335, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0630684792995453, + "rewards/thk_ans_format_reward": 0.984375, + "step": 282, + "think_completion_length": 101.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.109375, + "epoch": 0.4772344013490725, + "grad_norm": 6.630513644178006, + "kl": 0.20751953125, + "learning_rate": 9.045531197301855e-07, + "loss": 0.0002, + "reward": 2.7624622583389282, + "reward_std": 0.2361261248588562, + "rewards/final_reward": 0.7597826504116494, + "rewards/mask_iou_reward": 0.3798913252058247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7624624371528625, + "rewards/thk_ans_format_reward": 1.0, + "step": 283, + "think_completion_length": 100.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.15625, + "epoch": 0.47892074198988194, + "grad_norm": 11.039690750084555, + "kl": 0.2099609375, + "learning_rate": 9.042158516020235e-07, + "loss": 0.0002, + "reward": 3.314423441886902, + "reward_std": 0.2912362292408943, + "rewards/final_reward": 0.8603434719464518, + "rewards/mask_iou_reward": 0.4301717359732259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3144232034683228, + "rewards/thk_ans_format_reward": 1.0, + "step": 284, + "think_completion_length": 112.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.921875, + "epoch": 0.4806070826306914, + "grad_norm": 5.349575558185867, + "kl": 0.23828125, + "learning_rate": 9.038785834738617e-07, + "loss": 0.0002, + "reward": 3.142780900001526, + "reward_std": 0.22693616338074207, + "rewards/final_reward": 1.1724235022393383, + "rewards/mask_iou_reward": 0.5862117511196692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1427810192108154, + "rewards/thk_ans_format_reward": 1.0, + "step": 285, + "think_completion_length": 101.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.921875, + "epoch": 0.4822934232715008, + "grad_norm": 4.78348858573279, + "kl": 0.2314453125, + "learning_rate": 9.035413153456997e-07, + "loss": 0.0002, + "reward": 3.2198691368103027, + "reward_std": 0.30350401997566223, + "rewards/final_reward": 1.092966510959865, + "rewards/mask_iou_reward": 0.5464832554799325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.219869077205658, + "rewards/thk_ans_format_reward": 1.0, + "step": 286, + "think_completion_length": 111.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.09375, + "epoch": 0.4839797639123103, + "grad_norm": 10.947389225955124, + "kl": 0.2138671875, + "learning_rate": 9.032040472175379e-07, + "loss": 0.0002, + "reward": 2.8175196647644043, + "reward_std": 0.17539205588400364, + "rewards/final_reward": 0.24655763983664442, + "rewards/mask_iou_reward": 0.12327881991832221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8175197541713715, + "rewards/thk_ans_format_reward": 1.0, + "step": 287, + "think_completion_length": 103.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.890625, + "epoch": 0.4856661045531197, + "grad_norm": 8.330590502308805, + "kl": 0.22314453125, + "learning_rate": 9.02866779089376e-07, + "loss": 0.0002, + "reward": 2.9739983081817627, + "reward_std": 0.48663294315338135, + "rewards/final_reward": 0.9481654836818227, + "rewards/mask_iou_reward": 0.47408274184091137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9739983677864075, + "rewards/thk_ans_format_reward": 1.0, + "step": 288, + "think_completion_length": 100.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.71875, + "epoch": 0.4873524451939292, + "grad_norm": 14.412230517681351, + "kl": 0.24658203125, + "learning_rate": 9.025295109612141e-07, + "loss": 0.0002, + "reward": 3.198970317840576, + "reward_std": 0.4464504271745682, + "rewards/final_reward": 1.0407472399001998, + "rewards/mask_iou_reward": 0.5203736199500999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1989702880382538, + "rewards/thk_ans_format_reward": 1.0, + "step": 289, + "think_completion_length": 109.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.125, + "epoch": 0.48903878583473864, + "grad_norm": 5.205770260765118, + "kl": 0.24609375, + "learning_rate": 9.021922428330523e-07, + "loss": 0.0002, + "reward": 3.387374758720398, + "reward_std": 0.2529134303331375, + "rewards/final_reward": 1.6759642648813542, + "rewards/mask_iou_reward": 0.8379821324406771, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3873746991157532, + "rewards/thk_ans_format_reward": 1.0, + "step": 290, + "think_completion_length": 94.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.90625, + "epoch": 0.49072512647554806, + "grad_norm": 3.900650643571578, + "kl": 0.21435546875, + "learning_rate": 9.018549747048904e-07, + "loss": 0.0002, + "reward": 2.4927295446395874, + "reward_std": 0.18346240185201168, + "rewards/final_reward": 0.17499262628138723, + "rewards/mask_iou_reward": 0.08749631314069362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4927296042442322, + "rewards/thk_ans_format_reward": 1.0, + "step": 291, + "think_completion_length": 91.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.03125, + "epoch": 0.4924114671163575, + "grad_norm": 7.886194507737518, + "kl": 0.2470703125, + "learning_rate": 9.015177065767285e-07, + "loss": 0.0002, + "reward": 3.286818265914917, + "reward_std": 0.3581873029470444, + "rewards/final_reward": 1.2156858672661348, + "rewards/mask_iou_reward": 0.6078429336330674, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.318068265914917, + "rewards/thk_ans_format_reward": 0.984375, + "step": 292, + "think_completion_length": 125.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.40625, + "epoch": 0.49409780775716694, + "grad_norm": 10.927375058352002, + "kl": 0.20556640625, + "learning_rate": 9.011804384485667e-07, + "loss": 0.0002, + "reward": 3.2041701078414917, + "reward_std": 0.12643220275640488, + "rewards/final_reward": 1.5542415887341692, + "rewards/mask_iou_reward": 0.7771207943670846, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2041699886322021, + "rewards/thk_ans_format_reward": 1.0, + "step": 293, + "think_completion_length": 107.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.546875, + "epoch": 0.4957841483979764, + "grad_norm": 4.79296532813768, + "kl": 0.236328125, + "learning_rate": 9.008431703204047e-07, + "loss": 0.0002, + "reward": 3.6925183534622192, + "reward_std": 0.17621507868170738, + "rewards/final_reward": 1.5201054729840793, + "rewards/mask_iou_reward": 0.7600527364920396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.692518174648285, + "rewards/thk_ans_format_reward": 1.0, + "step": 294, + "think_completion_length": 80.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.296875, + "epoch": 0.4974704890387858, + "grad_norm": 18.374741866025047, + "kl": 0.1962890625, + "learning_rate": 9.005059021922427e-07, + "loss": 0.0002, + "reward": 3.449096202850342, + "reward_std": 0.09678211063146591, + "rewards/final_reward": 1.7411923961628277, + "rewards/mask_iou_reward": 0.8705961980814139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4647212624549866, + "rewards/thk_ans_format_reward": 0.984375, + "step": 295, + "think_completion_length": 100.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.671875, + "epoch": 0.4991568296795953, + "grad_norm": 4.434564260613753, + "kl": 0.2783203125, + "learning_rate": 9.001686340640809e-07, + "loss": 0.0003, + "reward": 2.545999526977539, + "reward_std": 0.38757744431495667, + "rewards/final_reward": 0.3226123989686819, + "rewards/mask_iou_reward": 0.16130619948434094, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5459995269775391, + "rewards/thk_ans_format_reward": 1.0, + "step": 296, + "think_completion_length": 86.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.484375, + "epoch": 0.5008431703204047, + "grad_norm": 23.58574110222777, + "kl": 0.208984375, + "learning_rate": 8.99831365935919e-07, + "loss": 0.0002, + "reward": 2.6845574378967285, + "reward_std": 0.7191915810108185, + "rewards/final_reward": 1.0216346625140318, + "rewards/mask_iou_reward": 0.5108173312570159, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 0.7783074378967285, + "rewards/thk_ans_format_reward": 1.0, + "step": 297, + "think_completion_length": 102.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.234375, + "epoch": 0.5025295109612141, + "grad_norm": 9.285836380042333, + "kl": 0.2109375, + "learning_rate": 8.994940978077571e-07, + "loss": 0.0002, + "reward": 3.4048011302948, + "reward_std": 0.15381522569805384, + "rewards/final_reward": 1.265697850525529, + "rewards/mask_iou_reward": 0.6328489252627645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4048011302947998, + "rewards/thk_ans_format_reward": 1.0, + "step": 298, + "think_completion_length": 92.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.578125, + "epoch": 0.5042158516020236, + "grad_norm": 8.44030956368844, + "kl": 0.19580078125, + "learning_rate": 8.991568296795953e-07, + "loss": 0.0002, + "reward": 3.108208417892456, + "reward_std": 0.45496051013469696, + "rewards/final_reward": 1.314274766823605, + "rewards/mask_iou_reward": 0.6571373834118025, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1082082986831665, + "rewards/thk_ans_format_reward": 1.0, + "step": 299, + "think_completion_length": 95.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.15625, + "epoch": 0.5059021922428331, + "grad_norm": 5.275100464525899, + "kl": 0.20849609375, + "learning_rate": 8.988195615514334e-07, + "loss": 0.0002, + "reward": 3.0727845430374146, + "reward_std": 0.338802233338356, + "rewards/final_reward": 1.0132605632547391, + "rewards/mask_iou_reward": 0.5066302816273696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.072784423828125, + "rewards/thk_ans_format_reward": 1.0, + "step": 300, + "think_completion_length": 108.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.71875, + "epoch": 0.5075885328836425, + "grad_norm": 8.013159633748892, + "kl": 0.20458984375, + "learning_rate": 8.984822934232715e-07, + "loss": 0.0002, + "reward": 3.0154377222061157, + "reward_std": 0.28923243284225464, + "rewards/final_reward": 0.7514977248736657, + "rewards/mask_iou_reward": 0.37574886243683286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0154377818107605, + "rewards/thk_ans_format_reward": 1.0, + "step": 301, + "think_completion_length": 88.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.578125, + "epoch": 0.5092748735244519, + "grad_norm": 4.742694603027937, + "kl": 0.22265625, + "learning_rate": 8.981450252951097e-07, + "loss": 0.0002, + "reward": 3.021862030029297, + "reward_std": 0.45061095058918, + "rewards/final_reward": 0.8931014002412393, + "rewards/mask_iou_reward": 0.44655070012061965, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0531122088432312, + "rewards/thk_ans_format_reward": 0.984375, + "step": 302, + "think_completion_length": 108.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.296875, + "epoch": 0.5109612141652614, + "grad_norm": 8.474267712901908, + "kl": 0.251953125, + "learning_rate": 8.978077571669476e-07, + "loss": 0.0003, + "reward": 2.9662917852401733, + "reward_std": 0.6356634199619293, + "rewards/final_reward": 1.029761918343917, + "rewards/mask_iou_reward": 0.5148809591719585, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9662917256355286, + "rewards/thk_ans_format_reward": 1.0, + "step": 303, + "think_completion_length": 82.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.78125, + "epoch": 0.5126475548060708, + "grad_norm": 7.372693309576553, + "kl": 0.2939453125, + "learning_rate": 8.974704890387857e-07, + "loss": 0.0003, + "reward": 2.326613187789917, + "reward_std": 0.2381967380642891, + "rewards/final_reward": 0.05411758343514183, + "rewards/mask_iou_reward": 0.027058791717570915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.3266131319105625, + "rewards/thk_ans_format_reward": 1.0, + "step": 304, + "think_completion_length": 86.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.421875, + "epoch": 0.5143338954468802, + "grad_norm": 8.446925811165816, + "kl": 0.224609375, + "learning_rate": 8.971332209106239e-07, + "loss": 0.0002, + "reward": 3.1610687971115112, + "reward_std": 0.26836006343364716, + "rewards/final_reward": 0.9938329226396494, + "rewards/mask_iou_reward": 0.4969164613198247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1610687971115112, + "rewards/thk_ans_format_reward": 1.0, + "step": 305, + "think_completion_length": 80.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.234375, + "epoch": 0.5160202360876898, + "grad_norm": 8.702016230273058, + "kl": 0.2177734375, + "learning_rate": 8.96795952782462e-07, + "loss": 0.0002, + "reward": 3.011886239051819, + "reward_std": 0.32331761717796326, + "rewards/final_reward": 0.919041138538325, + "rewards/mask_iou_reward": 0.4595205692691625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0118862390518188, + "rewards/thk_ans_format_reward": 1.0, + "step": 306, + "think_completion_length": 85.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.9375, + "epoch": 0.5177065767284992, + "grad_norm": 3.980012513394504, + "kl": 0.2255859375, + "learning_rate": 8.964586846543001e-07, + "loss": 0.0002, + "reward": 2.9461448192596436, + "reward_std": 0.10024909558705986, + "rewards/final_reward": 1.7532795550322358, + "rewards/mask_iou_reward": 0.8766397775161179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9461447596549988, + "rewards/thk_ans_format_reward": 1.0, + "step": 307, + "think_completion_length": 75.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0625, + "epoch": 0.5193929173693086, + "grad_norm": 7.147985372138769, + "kl": 0.2197265625, + "learning_rate": 8.961214165261383e-07, + "loss": 0.0002, + "reward": 2.450806975364685, + "reward_std": 0.22551338374614716, + "rewards/final_reward": 0.2143655313679812, + "rewards/mask_iou_reward": 0.1071827656839906, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.45080701261758804, + "rewards/thk_ans_format_reward": 1.0, + "step": 308, + "think_completion_length": 81.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.421875, + "epoch": 0.521079258010118, + "grad_norm": 5.551548351845276, + "kl": 0.23486328125, + "learning_rate": 8.957841483979764e-07, + "loss": 0.0002, + "reward": 3.691397547721863, + "reward_std": 0.05219288542866707, + "rewards/final_reward": 1.5435201661631972, + "rewards/mask_iou_reward": 0.7717600830815986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6913974285125732, + "rewards/thk_ans_format_reward": 1.0, + "step": 309, + "think_completion_length": 79.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.203125, + "epoch": 0.5227655986509275, + "grad_norm": 11.529649035295114, + "kl": 0.24609375, + "learning_rate": 8.954468802698145e-07, + "loss": 0.0002, + "reward": 2.901958465576172, + "reward_std": 0.31998536735773087, + "rewards/final_reward": 0.873896867313297, + "rewards/mask_iou_reward": 0.4369484336566485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9019584059715271, + "rewards/thk_ans_format_reward": 1.0, + "step": 310, + "think_completion_length": 66.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.96875, + "epoch": 0.524451939291737, + "grad_norm": 6.252373978112095, + "kl": 0.25439453125, + "learning_rate": 8.951096121416525e-07, + "loss": 0.0003, + "reward": 2.746753692626953, + "reward_std": 0.17325819842517376, + "rewards/final_reward": 0.7760384249471024, + "rewards/mask_iou_reward": 0.3880192124735512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7467536926269531, + "rewards/thk_ans_format_reward": 1.0, + "step": 311, + "think_completion_length": 76.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.09375, + "epoch": 0.5261382799325464, + "grad_norm": 17.287381416733663, + "kl": 0.326171875, + "learning_rate": 8.947723440134906e-07, + "loss": 0.0003, + "reward": 3.181082844734192, + "reward_std": 0.42962639033794403, + "rewards/final_reward": 1.0247396851042567, + "rewards/mask_iou_reward": 0.5123698425521284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.181082844734192, + "rewards/thk_ans_format_reward": 1.0, + "step": 312, + "think_completion_length": 70.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8125, + "epoch": 0.5278246205733558, + "grad_norm": 4.662803047611527, + "kl": 0.255859375, + "learning_rate": 8.944350758853288e-07, + "loss": 0.0002, + "reward": 3.6371694803237915, + "reward_std": 0.04736559418961406, + "rewards/final_reward": 1.8132385278320307, + "rewards/mask_iou_reward": 0.9066192639160153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6371695399284363, + "rewards/thk_ans_format_reward": 1.0, + "step": 313, + "think_completion_length": 71.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.453125, + "epoch": 0.5295109612141653, + "grad_norm": 5.2767043392470905, + "kl": 0.2578125, + "learning_rate": 8.940978077571669e-07, + "loss": 0.0003, + "reward": 2.710046172142029, + "reward_std": 0.3843417316675186, + "rewards/final_reward": 0.8192079907416744, + "rewards/mask_iou_reward": 0.4096039953708372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7100460827350616, + "rewards/thk_ans_format_reward": 1.0, + "step": 314, + "think_completion_length": 65.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0625, + "epoch": 0.5311973018549747, + "grad_norm": 5.645559638838532, + "kl": 0.2685546875, + "learning_rate": 8.93760539629005e-07, + "loss": 0.0003, + "reward": 2.684740424156189, + "reward_std": 0.3946193754673004, + "rewards/final_reward": 0.541776743554244, + "rewards/mask_iou_reward": 0.270888371777122, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6847403347492218, + "rewards/thk_ans_format_reward": 1.0, + "step": 315, + "think_completion_length": 76.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.53125, + "epoch": 0.5328836424957841, + "grad_norm": 3.2402841409799064, + "kl": 0.25, + "learning_rate": 8.934232715008432e-07, + "loss": 0.0002, + "reward": 3.3814308643341064, + "reward_std": 0.13638974726200104, + "rewards/final_reward": 1.1814053927053858, + "rewards/mask_iou_reward": 0.5907026963526929, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.381430983543396, + "rewards/thk_ans_format_reward": 1.0, + "step": 316, + "think_completion_length": 73.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.453125, + "epoch": 0.5345699831365935, + "grad_norm": 11.639876433776317, + "kl": 0.2314453125, + "learning_rate": 8.930860033726813e-07, + "loss": 0.0002, + "reward": 2.7054232358932495, + "reward_std": 0.3478597477078438, + "rewards/final_reward": 1.0303473977491717, + "rewards/mask_iou_reward": 0.5151736988745859, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7054231911897659, + "rewards/thk_ans_format_reward": 1.0, + "step": 317, + "think_completion_length": 66.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5, + "epoch": 0.5362563237774031, + "grad_norm": 3.7880326698570377, + "kl": 0.24658203125, + "learning_rate": 8.927487352445194e-07, + "loss": 0.0002, + "reward": 2.7247750759124756, + "reward_std": 0.29200705885887146, + "rewards/final_reward": 0.6541380888774231, + "rewards/mask_iou_reward": 0.32706904443871154, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7247750461101532, + "rewards/thk_ans_format_reward": 1.0, + "step": 318, + "think_completion_length": 64.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6875, + "epoch": 0.5379426644182125, + "grad_norm": 11.191965779893017, + "kl": 0.23828125, + "learning_rate": 8.924114671163576e-07, + "loss": 0.0002, + "reward": 3.127274513244629, + "reward_std": 0.27082036435604095, + "rewards/final_reward": 0.8115027511302748, + "rewards/mask_iou_reward": 0.4057513755651374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1272744536399841, + "rewards/thk_ans_format_reward": 1.0, + "step": 319, + "think_completion_length": 58.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.703125, + "epoch": 0.5396290050590219, + "grad_norm": 6.2600404543257895, + "kl": 0.294921875, + "learning_rate": 8.920741989881955e-07, + "loss": 0.0003, + "reward": 3.504320979118347, + "reward_std": 0.2580454498529434, + "rewards/final_reward": 1.7895027190421027, + "rewards/mask_iou_reward": 0.8947513595210513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.504321038722992, + "rewards/thk_ans_format_reward": 1.0, + "step": 320, + "think_completion_length": 68.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.078125, + "epoch": 0.5413153456998314, + "grad_norm": 4.75179766813063, + "kl": 0.2607421875, + "learning_rate": 8.917369308600336e-07, + "loss": 0.0003, + "reward": 2.915022373199463, + "reward_std": 0.38734908401966095, + "rewards/final_reward": 0.9229503567709705, + "rewards/mask_iou_reward": 0.46147517838548524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9150223135948181, + "rewards/thk_ans_format_reward": 1.0, + "step": 321, + "think_completion_length": 61.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.59375, + "epoch": 0.5430016863406408, + "grad_norm": 23.226641449249314, + "kl": 0.36572265625, + "learning_rate": 8.913996627318718e-07, + "loss": 0.0004, + "reward": 3.505223870277405, + "reward_std": 0.25062863528728485, + "rewards/final_reward": 1.6520057781269646, + "rewards/mask_iou_reward": 0.8260028890634823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5052238702774048, + "rewards/thk_ans_format_reward": 1.0, + "step": 322, + "think_completion_length": 70.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.859375, + "epoch": 0.5446880269814502, + "grad_norm": 4.361418731881005, + "kl": 0.404296875, + "learning_rate": 8.910623946037099e-07, + "loss": 0.0004, + "reward": 3.1083903312683105, + "reward_std": 0.23866655677556992, + "rewards/final_reward": 0.7571974737717716, + "rewards/mask_iou_reward": 0.3785987368858858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1083903908729553, + "rewards/thk_ans_format_reward": 1.0, + "step": 323, + "think_completion_length": 62.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.9375, + "epoch": 0.5463743676222597, + "grad_norm": 4.5546780251605385, + "kl": 0.248046875, + "learning_rate": 8.90725126475548e-07, + "loss": 0.0002, + "reward": 2.8330706357955933, + "reward_std": 0.14445627853274345, + "rewards/final_reward": 0.9145134767110203, + "rewards/mask_iou_reward": 0.4572567383555102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8330707550048828, + "rewards/thk_ans_format_reward": 1.0, + "step": 324, + "think_completion_length": 62.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.96875, + "epoch": 0.5480607082630692, + "grad_norm": 54.15924549523269, + "kl": 3.556640625, + "learning_rate": 8.903878583473862e-07, + "loss": 0.0036, + "reward": 2.7553576231002808, + "reward_std": 0.31073715165257454, + "rewards/final_reward": 1.0346113208464451, + "rewards/mask_iou_reward": 0.5173056604232226, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7553575932979584, + "rewards/thk_ans_format_reward": 1.0, + "step": 325, + "think_completion_length": 55.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.09375, + "epoch": 0.5497470489038786, + "grad_norm": 72.79025141301403, + "kl": 0.2392578125, + "learning_rate": 8.900505902192243e-07, + "loss": 0.0002, + "reward": 3.7254384756088257, + "reward_std": 0.11009544506669044, + "rewards/final_reward": 1.8056284891474292, + "rewards/mask_iou_reward": 0.9028142445737146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7254384756088257, + "rewards/thk_ans_format_reward": 1.0, + "step": 326, + "think_completion_length": 65.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.90625, + "epoch": 0.551433389544688, + "grad_norm": 6.158065182698799, + "kl": 0.259765625, + "learning_rate": 8.897133220910623e-07, + "loss": 0.0003, + "reward": 2.8164632320404053, + "reward_std": 0.2505396902561188, + "rewards/final_reward": 0.820963424891961, + "rewards/mask_iou_reward": 0.4104817124459805, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8164632171392441, + "rewards/thk_ans_format_reward": 1.0, + "step": 327, + "think_completion_length": 60.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.71875, + "epoch": 0.5531197301854974, + "grad_norm": 6.008582279919581, + "kl": 0.27001953125, + "learning_rate": 8.893760539629005e-07, + "loss": 0.0003, + "reward": 2.9342983961105347, + "reward_std": 0.48896993696689606, + "rewards/final_reward": 0.9667943819377082, + "rewards/mask_iou_reward": 0.4833971909688541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.934298574924469, + "rewards/thk_ans_format_reward": 1.0, + "step": 328, + "think_completion_length": 67.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.359375, + "epoch": 0.554806070826307, + "grad_norm": 5.243788107627332, + "kl": 0.27490234375, + "learning_rate": 8.890387858347385e-07, + "loss": 0.0003, + "reward": 3.6061549186706543, + "reward_std": 0.32689087092876434, + "rewards/final_reward": 1.5299962392263828, + "rewards/mask_iou_reward": 0.7649981196131914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6061549186706543, + "rewards/thk_ans_format_reward": 1.0, + "step": 329, + "think_completion_length": 55.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.75, + "epoch": 0.5564924114671164, + "grad_norm": 4.358233922614152, + "kl": 0.24072265625, + "learning_rate": 8.887015177065766e-07, + "loss": 0.0002, + "reward": 3.3998658657073975, + "reward_std": 0.2283829301595688, + "rewards/final_reward": 1.1769258962723228, + "rewards/mask_iou_reward": 0.5884629481361614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.399865746498108, + "rewards/thk_ans_format_reward": 1.0, + "step": 330, + "think_completion_length": 61.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.953125, + "epoch": 0.5581787521079258, + "grad_norm": 5.040552564523054, + "kl": 0.2470703125, + "learning_rate": 8.883642495784148e-07, + "loss": 0.0002, + "reward": 2.9218242168426514, + "reward_std": 0.3060344159603119, + "rewards/final_reward": 1.0111649972253083, + "rewards/mask_iou_reward": 0.5055824986126541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9218244031071663, + "rewards/thk_ans_format_reward": 1.0, + "step": 331, + "think_completion_length": 53.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.921875, + "epoch": 0.5598650927487352, + "grad_norm": 13.687986163382138, + "kl": 0.25439453125, + "learning_rate": 8.880269814502529e-07, + "loss": 0.0003, + "reward": 3.163904905319214, + "reward_std": 0.24482608400285244, + "rewards/final_reward": 1.5012755393722892, + "rewards/mask_iou_reward": 0.7506377696861446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.179529845714569, + "rewards/thk_ans_format_reward": 0.984375, + "step": 332, + "think_completion_length": 51.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5, + "epoch": 0.5615514333895447, + "grad_norm": 4.203445201103453, + "kl": 0.25048828125, + "learning_rate": 8.87689713322091e-07, + "loss": 0.0003, + "reward": 2.74581241607666, + "reward_std": 0.26485906541347504, + "rewards/final_reward": 1.1390789726854125, + "rewards/mask_iou_reward": 0.5695394863427062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7458123862743378, + "rewards/thk_ans_format_reward": 1.0, + "step": 333, + "think_completion_length": 54.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.796875, + "epoch": 0.5632377740303541, + "grad_norm": 4.155544073994526, + "kl": 0.2412109375, + "learning_rate": 8.873524451939292e-07, + "loss": 0.0002, + "reward": 2.7413270473480225, + "reward_std": 0.19857023283839226, + "rewards/final_reward": 0.8799631358931382, + "rewards/mask_iou_reward": 0.4399815679465691, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7413269579410553, + "rewards/thk_ans_format_reward": 1.0, + "step": 334, + "think_completion_length": 62.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.15625, + "epoch": 0.5649241146711635, + "grad_norm": 4.1431683610726875, + "kl": 0.2841796875, + "learning_rate": 8.870151770657673e-07, + "loss": 0.0003, + "reward": 3.1733168363571167, + "reward_std": 0.3433510363101959, + "rewards/final_reward": 0.6148122126354274, + "rewards/mask_iou_reward": 0.3074061063177137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.173316776752472, + "rewards/thk_ans_format_reward": 1.0, + "step": 335, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.703125, + "epoch": 0.5666104553119731, + "grad_norm": 5.87787727228753, + "kl": 0.26904296875, + "learning_rate": 8.866779089376053e-07, + "loss": 0.0003, + "reward": 2.931514024734497, + "reward_std": 0.17161303758621216, + "rewards/final_reward": 1.1005590800769713, + "rewards/mask_iou_reward": 0.5502795400384857, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9315140843391418, + "rewards/thk_ans_format_reward": 1.0, + "step": 336, + "think_completion_length": 55.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.046875, + "epoch": 0.5682967959527825, + "grad_norm": 4.990989117785591, + "kl": 0.251953125, + "learning_rate": 8.863406408094435e-07, + "loss": 0.0002, + "reward": 3.413249135017395, + "reward_std": 0.1451067440211773, + "rewards/final_reward": 1.8223518536043777, + "rewards/mask_iou_reward": 0.9111759268021888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4132491946220398, + "rewards/thk_ans_format_reward": 1.0, + "step": 337, + "think_completion_length": 55.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.4375, + "epoch": 0.5699831365935919, + "grad_norm": 27.3955634919466, + "kl": 0.369140625, + "learning_rate": 8.860033726812815e-07, + "loss": 0.0004, + "reward": 3.265120506286621, + "reward_std": 0.2976267971098423, + "rewards/final_reward": 1.0012632907001968, + "rewards/mask_iou_reward": 0.5006316453500984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2807455658912659, + "rewards/thk_ans_format_reward": 0.984375, + "step": 338, + "think_completion_length": 58.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 0.5716694772344013, + "grad_norm": 5.072182504253793, + "kl": 0.263671875, + "learning_rate": 8.856661045531197e-07, + "loss": 0.0003, + "reward": 3.1486185789108276, + "reward_std": 0.3894375190138817, + "rewards/final_reward": 1.375238376644695, + "rewards/mask_iou_reward": 0.6876191883223475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1486185491085052, + "rewards/thk_ans_format_reward": 1.0, + "step": 339, + "think_completion_length": 51.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.796875, + "epoch": 0.5733558178752108, + "grad_norm": 4.334373316071912, + "kl": 0.263671875, + "learning_rate": 8.853288364249578e-07, + "loss": 0.0003, + "reward": 3.1587361097335815, + "reward_std": 0.3358212560415268, + "rewards/final_reward": 1.353470388714372, + "rewards/mask_iou_reward": 0.676735194357186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1587361097335815, + "rewards/thk_ans_format_reward": 1.0, + "step": 340, + "think_completion_length": 60.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.046875, + "epoch": 0.5750421585160203, + "grad_norm": 5.806740334340846, + "kl": 0.2587890625, + "learning_rate": 8.849915682967959e-07, + "loss": 0.0003, + "reward": 3.818936824798584, + "reward_std": 0.04810533579438925, + "rewards/final_reward": 1.8369189113220132, + "rewards/mask_iou_reward": 0.9184594556610066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.818936824798584, + "rewards/thk_ans_format_reward": 1.0, + "step": 341, + "think_completion_length": 66.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.71875, + "epoch": 0.5767284991568297, + "grad_norm": 6.058374485035506, + "kl": 0.4013671875, + "learning_rate": 8.846543001686341e-07, + "loss": 0.0004, + "reward": 3.5225307941436768, + "reward_std": 0.37527384608983994, + "rewards/final_reward": 1.4858986672894283, + "rewards/mask_iou_reward": 0.7429493336447142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.522530734539032, + "rewards/thk_ans_format_reward": 1.0, + "step": 342, + "think_completion_length": 61.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.640625, + "epoch": 0.5784148397976391, + "grad_norm": 3.3695996161749124, + "kl": 0.23681640625, + "learning_rate": 8.843170320404722e-07, + "loss": 0.0002, + "reward": 3.1007397174835205, + "reward_std": 0.12492630630731583, + "rewards/final_reward": 1.1520376838351214, + "rewards/mask_iou_reward": 0.5760188419175607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.100739747285843, + "rewards/thk_ans_format_reward": 1.0, + "step": 343, + "think_completion_length": 54.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.203125, + "epoch": 0.5801011804384486, + "grad_norm": 4.939004747976328, + "kl": 0.28857421875, + "learning_rate": 8.839797639123102e-07, + "loss": 0.0003, + "reward": 3.2634146213531494, + "reward_std": 0.3214031755924225, + "rewards/final_reward": 0.9727509594520076, + "rewards/mask_iou_reward": 0.4863754797260038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2634146213531494, + "rewards/thk_ans_format_reward": 1.0, + "step": 344, + "think_completion_length": 50.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.546875, + "epoch": 0.581787521079258, + "grad_norm": 7.867013326293164, + "kl": 0.25634765625, + "learning_rate": 8.836424957841484e-07, + "loss": 0.0003, + "reward": 2.949946165084839, + "reward_std": 0.45425738394260406, + "rewards/final_reward": 0.9740685863298226, + "rewards/mask_iou_reward": 0.4870342931649113, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9655711352825165, + "rewards/thk_ans_format_reward": 1.0, + "step": 345, + "think_completion_length": 60.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.875, + "epoch": 0.5834738617200674, + "grad_norm": 3.5260809492982257, + "kl": 0.3115234375, + "learning_rate": 8.833052276559864e-07, + "loss": 0.0003, + "reward": 2.9657833576202393, + "reward_std": 0.3081818874925375, + "rewards/final_reward": 1.1107575452706013, + "rewards/mask_iou_reward": 0.5553787726353007, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9814085066318512, + "rewards/thk_ans_format_reward": 1.0, + "step": 346, + "think_completion_length": 62.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.4375, + "epoch": 0.5851602023608768, + "grad_norm": 5.035821733992209, + "kl": 0.25048828125, + "learning_rate": 8.829679595278245e-07, + "loss": 0.0003, + "reward": 2.996280312538147, + "reward_std": 0.13460349664092064, + "rewards/final_reward": 1.308326504855776, + "rewards/mask_iou_reward": 0.654163252427888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9962801933288574, + "rewards/thk_ans_format_reward": 1.0, + "step": 347, + "think_completion_length": 63.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.78125, + "epoch": 0.5868465430016864, + "grad_norm": 27.576882428955795, + "kl": 0.2685546875, + "learning_rate": 8.826306913996627e-07, + "loss": 0.0003, + "reward": 2.9972203969955444, + "reward_std": 0.2722517102956772, + "rewards/final_reward": 0.8487660003192941, + "rewards/mask_iou_reward": 0.42438300015964703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9972204267978668, + "rewards/thk_ans_format_reward": 1.0, + "step": 348, + "think_completion_length": 68.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.65625, + "epoch": 0.5885328836424958, + "grad_norm": 4.389319652797556, + "kl": 0.24609375, + "learning_rate": 8.822934232715008e-07, + "loss": 0.0002, + "reward": 2.865875244140625, + "reward_std": 0.13240397721529007, + "rewards/final_reward": 0.9376044149712613, + "rewards/mask_iou_reward": 0.4688022074856307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8658752739429474, + "rewards/thk_ans_format_reward": 1.0, + "step": 349, + "think_completion_length": 59.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.625, + "epoch": 0.5902192242833052, + "grad_norm": 5.101680634393306, + "kl": 0.43115234375, + "learning_rate": 8.819561551433389e-07, + "loss": 0.0004, + "reward": 3.332337260246277, + "reward_std": 0.3843718320131302, + "rewards/final_reward": 1.221562650525076, + "rewards/mask_iou_reward": 0.610781325262538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3323372602462769, + "rewards/thk_ans_format_reward": 1.0, + "step": 350, + "think_completion_length": 56.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.765625, + "epoch": 0.5919055649241147, + "grad_norm": 6.998265450002808, + "kl": 0.25390625, + "learning_rate": 8.816188870151771e-07, + "loss": 0.0003, + "reward": 2.870605945587158, + "reward_std": 0.2439437434077263, + "rewards/final_reward": 0.754005258460424, + "rewards/mask_iou_reward": 0.377002629230212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8706059455871582, + "rewards/thk_ans_format_reward": 1.0, + "step": 351, + "think_completion_length": 57.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.46875, + "epoch": 0.5935919055649241, + "grad_norm": 6.202091538730412, + "kl": 0.251953125, + "learning_rate": 8.812816188870151e-07, + "loss": 0.0003, + "reward": 3.2305729389190674, + "reward_std": 0.4073144942522049, + "rewards/final_reward": 1.0945567016798163, + "rewards/mask_iou_reward": 0.5472783508399082, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.2774479985237122, + "rewards/thk_ans_format_reward": 1.0, + "step": 352, + "think_completion_length": 63.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 0.5952782462057336, + "grad_norm": 5.579559586465592, + "kl": 0.2841796875, + "learning_rate": 8.809443507588532e-07, + "loss": 0.0003, + "reward": 3.342374801635742, + "reward_std": 0.39917896687984467, + "rewards/final_reward": 1.1869944150046479, + "rewards/mask_iou_reward": 0.5934972075023239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3423748016357422, + "rewards/thk_ans_format_reward": 1.0, + "step": 353, + "think_completion_length": 60.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.421875, + "epoch": 0.596964586846543, + "grad_norm": 5.020379721979836, + "kl": 0.275390625, + "learning_rate": 8.806070826306914e-07, + "loss": 0.0003, + "reward": 3.1944527626037598, + "reward_std": 0.2604561969637871, + "rewards/final_reward": 1.854011010006161, + "rewards/mask_iou_reward": 0.9270055050030805, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.194452702999115, + "rewards/thk_ans_format_reward": 1.0, + "step": 354, + "think_completion_length": 60.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.890625, + "epoch": 0.5986509274873525, + "grad_norm": 13.02489612056237, + "kl": 0.29736328125, + "learning_rate": 8.802698145025294e-07, + "loss": 0.0003, + "reward": 2.82450008392334, + "reward_std": 0.2679259032011032, + "rewards/final_reward": 0.8881105820850812, + "rewards/mask_iou_reward": 0.4440552910425406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8245000541210175, + "rewards/thk_ans_format_reward": 1.0, + "step": 355, + "think_completion_length": 69.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.59375, + "epoch": 0.6003372681281619, + "grad_norm": 23.063711576098893, + "kl": 0.232421875, + "learning_rate": 8.799325463743675e-07, + "loss": 0.0002, + "reward": 2.9744956493377686, + "reward_std": 0.2751796506345272, + "rewards/final_reward": 1.4443419340676278, + "rewards/mask_iou_reward": 0.7221709670338139, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9901207089424133, + "rewards/thk_ans_format_reward": 1.0, + "step": 356, + "think_completion_length": 64.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.953125, + "epoch": 0.6020236087689713, + "grad_norm": 8.104848340003967, + "kl": 0.2822265625, + "learning_rate": 8.795952782462057e-07, + "loss": 0.0003, + "reward": 2.4996743202209473, + "reward_std": 0.39900892972946167, + "rewards/final_reward": 0.0625, + "rewards/mask_iou_reward": 0.03125, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.5465493500232697, + "rewards/thk_ans_format_reward": 0.984375, + "step": 357, + "think_completion_length": 56.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0625, + "epoch": 0.6037099494097807, + "grad_norm": 4.853883488792399, + "kl": 0.2734375, + "learning_rate": 8.792580101180438e-07, + "loss": 0.0003, + "reward": 3.561056137084961, + "reward_std": 0.40073344111442566, + "rewards/final_reward": 1.6063573685327528, + "rewards/mask_iou_reward": 0.8031786842663764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5610561966896057, + "rewards/thk_ans_format_reward": 1.0, + "step": 358, + "think_completion_length": 56.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.859375, + "epoch": 0.6053962900505903, + "grad_norm": 5.976143731168656, + "kl": 0.2568359375, + "learning_rate": 8.789207419898819e-07, + "loss": 0.0003, + "reward": 3.239910840988159, + "reward_std": 0.3226969689130783, + "rewards/final_reward": 1.1236291825562077, + "rewards/mask_iou_reward": 0.5618145912781038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2399107217788696, + "rewards/thk_ans_format_reward": 1.0, + "step": 359, + "think_completion_length": 68.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.578125, + "epoch": 0.6070826306913997, + "grad_norm": 5.128737900496672, + "kl": 0.263671875, + "learning_rate": 8.785834738617201e-07, + "loss": 0.0003, + "reward": 2.7771421670913696, + "reward_std": 0.5488343238830566, + "rewards/final_reward": 0.6804320378228413, + "rewards/mask_iou_reward": 0.34021601891142067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7771420478820801, + "rewards/thk_ans_format_reward": 1.0, + "step": 360, + "think_completion_length": 61.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 0.6087689713322091, + "grad_norm": 4.894849899866179, + "kl": 0.2734375, + "learning_rate": 8.782462057335581e-07, + "loss": 0.0003, + "reward": 3.0549187660217285, + "reward_std": 0.17568432539701462, + "rewards/final_reward": 1.432484962181782, + "rewards/mask_iou_reward": 0.716242481090891, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0549187511205673, + "rewards/thk_ans_format_reward": 1.0, + "step": 361, + "think_completion_length": 59.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.875, + "epoch": 0.6104553119730185, + "grad_norm": 5.639323127623386, + "kl": 0.3330078125, + "learning_rate": 8.779089376053963e-07, + "loss": 0.0003, + "reward": 3.342157244682312, + "reward_std": 0.3479279577732086, + "rewards/final_reward": 1.4514362461598367, + "rewards/mask_iou_reward": 0.7257181230799183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3421571850776672, + "rewards/thk_ans_format_reward": 1.0, + "step": 362, + "think_completion_length": 67.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.59375, + "epoch": 0.612141652613828, + "grad_norm": 9.40098674825858, + "kl": 0.3349609375, + "learning_rate": 8.775716694772344e-07, + "loss": 0.0003, + "reward": 3.223899245262146, + "reward_std": 0.177157923579216, + "rewards/final_reward": 1.1118048766189446, + "rewards/mask_iou_reward": 0.5559024383094723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2238992750644684, + "rewards/thk_ans_format_reward": 1.0, + "step": 363, + "think_completion_length": 64.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.15625, + "epoch": 0.6138279932546374, + "grad_norm": 59.3341930780702, + "kl": 0.22998046875, + "learning_rate": 8.772344013490724e-07, + "loss": 0.0002, + "reward": 3.087131977081299, + "reward_std": 0.4654431641101837, + "rewards/final_reward": 1.21102136613791, + "rewards/mask_iou_reward": 0.605510683068955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.087131917476654, + "rewards/thk_ans_format_reward": 1.0, + "step": 364, + "think_completion_length": 64.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.640625, + "epoch": 0.6155143338954469, + "grad_norm": 5.181103170179545, + "kl": 0.3125, + "learning_rate": 8.768971332209106e-07, + "loss": 0.0003, + "reward": 3.7247079610824585, + "reward_std": 0.22579550743103027, + "rewards/final_reward": 1.766907703986788, + "rewards/mask_iou_reward": 0.883453851993394, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.7403329610824585, + "rewards/thk_ans_format_reward": 1.0, + "step": 365, + "think_completion_length": 61.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9375, + "epoch": 0.6172006745362564, + "grad_norm": 7.005506467477656, + "kl": 0.2734375, + "learning_rate": 8.765598650927487e-07, + "loss": 0.0003, + "reward": 3.311755895614624, + "reward_std": 0.09347007237374783, + "rewards/final_reward": 0.9238924283518867, + "rewards/mask_iou_reward": 0.46194621417594334, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3117559254169464, + "rewards/thk_ans_format_reward": 1.0, + "step": 366, + "think_completion_length": 64.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.109375, + "epoch": 0.6188870151770658, + "grad_norm": 26.431305097403335, + "kl": 0.2783203125, + "learning_rate": 8.762225969645868e-07, + "loss": 0.0003, + "reward": 3.2874670028686523, + "reward_std": 0.3315463215112686, + "rewards/final_reward": 0.8942609141714675, + "rewards/mask_iou_reward": 0.44713045708573373, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2874669432640076, + "rewards/thk_ans_format_reward": 1.0, + "step": 367, + "think_completion_length": 73.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.328125, + "epoch": 0.6205733558178752, + "grad_norm": 22.45281866642402, + "kl": 0.384765625, + "learning_rate": 8.75885328836425e-07, + "loss": 0.0004, + "reward": 3.3114068508148193, + "reward_std": 0.14964250102639198, + "rewards/final_reward": 1.2569832236958134, + "rewards/mask_iou_reward": 0.6284916118479067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3114069104194641, + "rewards/thk_ans_format_reward": 1.0, + "step": 368, + "think_completion_length": 57.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.578125, + "epoch": 0.6222596964586846, + "grad_norm": 5.267594297758696, + "kl": 0.287109375, + "learning_rate": 8.75548060708263e-07, + "loss": 0.0003, + "reward": 3.296778082847595, + "reward_std": 0.28996123373508453, + "rewards/final_reward": 1.7162430048361945, + "rewards/mask_iou_reward": 0.8581215024180973, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2967780828475952, + "rewards/thk_ans_format_reward": 1.0, + "step": 369, + "think_completion_length": 68.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.734375, + "epoch": 0.6239460370994941, + "grad_norm": 32.26698855157922, + "kl": 0.25244140625, + "learning_rate": 8.752107925801011e-07, + "loss": 0.0002, + "reward": 3.587504506111145, + "reward_std": 0.20235136337578297, + "rewards/final_reward": 1.6433316468966788, + "rewards/mask_iou_reward": 0.8216658234483394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.587504506111145, + "rewards/thk_ans_format_reward": 1.0, + "step": 370, + "think_completion_length": 56.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.453125, + "epoch": 0.6256323777403036, + "grad_norm": 36.90675958939858, + "kl": 0.25830078125, + "learning_rate": 8.748735244519393e-07, + "loss": 0.0003, + "reward": 3.4644440412521362, + "reward_std": 0.14919602870941162, + "rewards/final_reward": 1.8359068655427828, + "rewards/mask_iou_reward": 0.9179534327713914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4644440412521362, + "rewards/thk_ans_format_reward": 1.0, + "step": 371, + "think_completion_length": 63.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.5625, + "epoch": 0.627318718381113, + "grad_norm": 3.977481472662859, + "kl": 0.2880859375, + "learning_rate": 8.745362563237774e-07, + "loss": 0.0003, + "reward": 3.485430121421814, + "reward_std": 0.08592750132083893, + "rewards/final_reward": 1.8542197972391161, + "rewards/mask_iou_reward": 0.9271098986195581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.485430121421814, + "rewards/thk_ans_format_reward": 1.0, + "step": 372, + "think_completion_length": 64.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0, + "epoch": 0.6290050590219224, + "grad_norm": 11.672200901434106, + "kl": 0.23095703125, + "learning_rate": 8.741989881956154e-07, + "loss": 0.0002, + "reward": 3.1509275436401367, + "reward_std": 0.3214876800775528, + "rewards/final_reward": 1.6934902014125646, + "rewards/mask_iou_reward": 0.8467451007062823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1509276628494263, + "rewards/thk_ans_format_reward": 1.0, + "step": 373, + "think_completion_length": 69.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.75, + "epoch": 0.6306913996627319, + "grad_norm": 6.7196830138170345, + "kl": 0.2705078125, + "learning_rate": 8.738617200674536e-07, + "loss": 0.0003, + "reward": 3.345046043395996, + "reward_std": 0.24192753434181213, + "rewards/final_reward": 1.1307378696874646, + "rewards/mask_iou_reward": 0.5653689348437323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3450458347797394, + "rewards/thk_ans_format_reward": 1.0, + "step": 374, + "think_completion_length": 64.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.90625, + "epoch": 0.6323777403035413, + "grad_norm": 6.715116261073944, + "kl": 0.2353515625, + "learning_rate": 8.735244519392917e-07, + "loss": 0.0002, + "reward": 2.55766224861145, + "reward_std": 0.07771342247724533, + "rewards/final_reward": 0.9295640305047778, + "rewards/mask_iou_reward": 0.4647820152523889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5576622486114502, + "rewards/thk_ans_format_reward": 1.0, + "step": 375, + "think_completion_length": 59.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.671875, + "epoch": 0.6340640809443507, + "grad_norm": 5.350210243493229, + "kl": 0.25146484375, + "learning_rate": 8.731871838111298e-07, + "loss": 0.0003, + "reward": 3.4477986097335815, + "reward_std": 0.20711440779268742, + "rewards/final_reward": 1.2917363954773742, + "rewards/mask_iou_reward": 0.6458681977386871, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4477986097335815, + "rewards/thk_ans_format_reward": 1.0, + "step": 376, + "think_completion_length": 59.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.75, + "epoch": 0.6357504215851602, + "grad_norm": 13.855411935365716, + "kl": 0.263671875, + "learning_rate": 8.728499156829679e-07, + "loss": 0.0003, + "reward": 3.2779818773269653, + "reward_std": 0.5463913530111313, + "rewards/final_reward": 0.9221951530177666, + "rewards/mask_iou_reward": 0.4610975765088833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2779819965362549, + "rewards/thk_ans_format_reward": 1.0, + "step": 377, + "think_completion_length": 75.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.28125, + "epoch": 0.6374367622259697, + "grad_norm": 15.4353365340289, + "kl": 0.28125, + "learning_rate": 8.72512647554806e-07, + "loss": 0.0003, + "reward": 2.8729852437973022, + "reward_std": 0.11946488171815872, + "rewards/final_reward": 1.374176703995387, + "rewards/mask_iou_reward": 0.6870883519976935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.872985303401947, + "rewards/thk_ans_format_reward": 1.0, + "step": 378, + "think_completion_length": 60.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.6391231028667791, + "grad_norm": 6.330254599055379, + "kl": 0.2646484375, + "learning_rate": 8.721753794266441e-07, + "loss": 0.0003, + "reward": 3.4926542043685913, + "reward_std": 0.360406506806612, + "rewards/final_reward": 1.8171903327357386, + "rewards/mask_iou_reward": 0.9085951663678693, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4926542043685913, + "rewards/thk_ans_format_reward": 1.0, + "step": 379, + "think_completion_length": 58.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 0.6408094435075885, + "grad_norm": 8.281536520621378, + "kl": 0.359375, + "learning_rate": 8.718381112984823e-07, + "loss": 0.0004, + "reward": 2.885327696800232, + "reward_std": 0.38757482171058655, + "rewards/final_reward": 1.036380061901672, + "rewards/mask_iou_reward": 0.518190030950836, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9009527862071991, + "rewards/thk_ans_format_reward": 0.984375, + "step": 380, + "think_completion_length": 53.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.78125, + "epoch": 0.642495784148398, + "grad_norm": 4.035395909526506, + "kl": 0.2958984375, + "learning_rate": 8.715008431703204e-07, + "loss": 0.0003, + "reward": 2.560006260871887, + "reward_std": 0.23123303055763245, + "rewards/final_reward": 0.8636089485805896, + "rewards/mask_iou_reward": 0.4318044742902948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.575631245970726, + "rewards/thk_ans_format_reward": 0.984375, + "step": 381, + "think_completion_length": 71.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.953125, + "epoch": 0.6441821247892074, + "grad_norm": 10.783252150736887, + "kl": 0.2900390625, + "learning_rate": 8.711635750421584e-07, + "loss": 0.0003, + "reward": 3.132146716117859, + "reward_std": 0.28874067962169647, + "rewards/final_reward": 0.9155655076835651, + "rewards/mask_iou_reward": 0.4577827538417826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1321466565132141, + "rewards/thk_ans_format_reward": 1.0, + "step": 382, + "think_completion_length": 62.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.703125, + "epoch": 0.6458684654300169, + "grad_norm": 11.402565008509699, + "kl": 0.263671875, + "learning_rate": 8.708263069139966e-07, + "loss": 0.0003, + "reward": 3.3465912342071533, + "reward_std": 0.4565277546644211, + "rewards/final_reward": 1.5629399880976687, + "rewards/mask_iou_reward": 0.7814699940488343, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3465911149978638, + "rewards/thk_ans_format_reward": 1.0, + "step": 383, + "think_completion_length": 57.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.375, + "epoch": 0.6475548060708263, + "grad_norm": 4.795135202730111, + "kl": 0.3251953125, + "learning_rate": 8.704890387858347e-07, + "loss": 0.0003, + "reward": 3.0373746156692505, + "reward_std": 0.33877818286418915, + "rewards/final_reward": 1.425423519540372, + "rewards/mask_iou_reward": 0.712711759770186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0373746454715729, + "rewards/thk_ans_format_reward": 1.0, + "step": 384, + "think_completion_length": 56.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.953125, + "epoch": 0.6492411467116358, + "grad_norm": 7.6054679003491, + "kl": 0.2783203125, + "learning_rate": 8.701517706576727e-07, + "loss": 0.0003, + "reward": 3.0235763788223267, + "reward_std": 0.34590520709753036, + "rewards/final_reward": 0.6524966898611744, + "rewards/mask_iou_reward": 0.3262483449305872, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0235763490200043, + "rewards/thk_ans_format_reward": 1.0, + "step": 385, + "think_completion_length": 61.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.6509274873524452, + "grad_norm": 7.642619901632982, + "kl": 0.26171875, + "learning_rate": 8.698145025295109e-07, + "loss": 0.0003, + "reward": 3.2443251609802246, + "reward_std": 0.15900463983416557, + "rewards/final_reward": 1.1955481569676252, + "rewards/mask_iou_reward": 0.5977740784838126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.244325041770935, + "rewards/thk_ans_format_reward": 1.0, + "step": 386, + "think_completion_length": 63.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.359375, + "epoch": 0.6526138279932546, + "grad_norm": 5.098340106768318, + "kl": 0.3203125, + "learning_rate": 8.69477234401349e-07, + "loss": 0.0006, + "reward": 2.9133780002593994, + "reward_std": 0.10066283494234085, + "rewards/final_reward": 0.13628261847652232, + "rewards/mask_iou_reward": 0.06814130923826116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9133780598640442, + "rewards/thk_ans_format_reward": 1.0, + "step": 387, + "think_completion_length": 59.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.25, + "epoch": 0.654300168634064, + "grad_norm": 4.621168421511462, + "kl": 0.38671875, + "learning_rate": 8.691399662731872e-07, + "loss": 0.0004, + "reward": 3.448104739189148, + "reward_std": 0.46191219985485077, + "rewards/final_reward": 1.5218526491931632, + "rewards/mask_iou_reward": 0.7609263245965816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4481046795845032, + "rewards/thk_ans_format_reward": 1.0, + "step": 388, + "think_completion_length": 56.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.359375, + "epoch": 0.6559865092748736, + "grad_norm": 5.379842188826864, + "kl": 0.421875, + "learning_rate": 8.688026981450253e-07, + "loss": 0.0004, + "reward": 3.3314003944396973, + "reward_std": 0.1324574500322342, + "rewards/final_reward": 1.4115282221098315, + "rewards/mask_iou_reward": 0.7057641110549158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3314002752304077, + "rewards/thk_ans_format_reward": 1.0, + "step": 389, + "think_completion_length": 65.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.984375, + "epoch": 0.657672849915683, + "grad_norm": 8.721876673645482, + "kl": 0.3671875, + "learning_rate": 8.684654300168634e-07, + "loss": 0.0004, + "reward": 3.0254807472229004, + "reward_std": 0.3890673518180847, + "rewards/final_reward": 0.6676522325720965, + "rewards/mask_iou_reward": 0.33382611628604825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0254807472229004, + "rewards/thk_ans_format_reward": 1.0, + "step": 390, + "think_completion_length": 72.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.40625, + "epoch": 0.6593591905564924, + "grad_norm": 6.60492896849073, + "kl": 0.2958984375, + "learning_rate": 8.681281618887015e-07, + "loss": 0.0003, + "reward": 3.555801510810852, + "reward_std": 0.3658871501684189, + "rewards/final_reward": 1.5090375076129487, + "rewards/mask_iou_reward": 0.7545187538064744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5558015704154968, + "rewards/thk_ans_format_reward": 1.0, + "step": 391, + "think_completion_length": 56.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.28125, + "epoch": 0.6610455311973018, + "grad_norm": 11.714815873295858, + "kl": 0.271484375, + "learning_rate": 8.677908937605396e-07, + "loss": 0.0003, + "reward": 2.6682028770446777, + "reward_std": 0.12304145842790604, + "rewards/final_reward": 0.7471882303933259, + "rewards/mask_iou_reward": 0.37359411519666297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.668202817440033, + "rewards/thk_ans_format_reward": 1.0, + "step": 392, + "think_completion_length": 71.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.375, + "epoch": 0.6627318718381113, + "grad_norm": 6.683431729410963, + "kl": 0.2978515625, + "learning_rate": 8.674536256323777e-07, + "loss": 0.0003, + "reward": 3.397181987762451, + "reward_std": 0.27734769880771637, + "rewards/final_reward": 1.081036873280966, + "rewards/mask_iou_reward": 0.540518436640483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3971819579601288, + "rewards/thk_ans_format_reward": 1.0, + "step": 393, + "think_completion_length": 66.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.109375, + "epoch": 0.6644182124789207, + "grad_norm": 12.546590991266813, + "kl": 0.296875, + "learning_rate": 8.671163575042158e-07, + "loss": 0.0003, + "reward": 2.7042452096939087, + "reward_std": 0.265919953584671, + "rewards/final_reward": 0.6445974072533363, + "rewards/mask_iou_reward": 0.3222987036266681, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7042451202869415, + "rewards/thk_ans_format_reward": 1.0, + "step": 394, + "think_completion_length": 74.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.515625, + "epoch": 0.6661045531197302, + "grad_norm": 9.094193873759082, + "kl": 0.23046875, + "learning_rate": 8.667790893760539e-07, + "loss": 0.0002, + "reward": 3.18087375164032, + "reward_std": 0.10219046473503113, + "rewards/final_reward": 1.65937009926742, + "rewards/mask_iou_reward": 0.82968504963371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1808739304542542, + "rewards/thk_ans_format_reward": 1.0, + "step": 395, + "think_completion_length": 65.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.03125, + "epoch": 0.6677908937605397, + "grad_norm": 8.923928936464332, + "kl": 0.306640625, + "learning_rate": 8.66441821247892e-07, + "loss": 0.0003, + "reward": 3.333390235900879, + "reward_std": 0.10420708172023296, + "rewards/final_reward": 1.1318658920698947, + "rewards/mask_iou_reward": 0.5659329460349474, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3333901762962341, + "rewards/thk_ans_format_reward": 1.0, + "step": 396, + "think_completion_length": 63.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.015625, + "epoch": 0.6694772344013491, + "grad_norm": 10.855733134185414, + "kl": 0.28466796875, + "learning_rate": 8.661045531197302e-07, + "loss": 0.0003, + "reward": 2.7989786863327026, + "reward_std": 0.48467782139778137, + "rewards/final_reward": 0.7798602489754165, + "rewards/mask_iou_reward": 0.38993012448770825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7989786863327026, + "rewards/thk_ans_format_reward": 1.0, + "step": 397, + "think_completion_length": 72.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.703125, + "epoch": 0.6711635750421585, + "grad_norm": 8.498372368415978, + "kl": 0.2392578125, + "learning_rate": 8.657672849915683e-07, + "loss": 0.0002, + "reward": 3.2104252576828003, + "reward_std": 0.3338836580514908, + "rewards/final_reward": 1.384165478226141, + "rewards/mask_iou_reward": 0.6920827391130705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2104252576828003, + "rewards/thk_ans_format_reward": 1.0, + "step": 398, + "think_completion_length": 55.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.15625, + "epoch": 0.6728499156829679, + "grad_norm": 4.168354530926599, + "kl": 0.3369140625, + "learning_rate": 8.654300168634064e-07, + "loss": 0.0003, + "reward": 3.360267162322998, + "reward_std": 0.27631398290395737, + "rewards/final_reward": 1.3226066492302255, + "rewards/mask_iou_reward": 0.6613033246151128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.360267162322998, + "rewards/thk_ans_format_reward": 1.0, + "step": 399, + "think_completion_length": 73.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.71875, + "epoch": 0.6745362563237775, + "grad_norm": 6.109202011063787, + "kl": 0.2451171875, + "learning_rate": 8.650927487352445e-07, + "loss": 0.0002, + "reward": 2.7300503253936768, + "reward_std": 0.3722390979528427, + "rewards/final_reward": 1.022374351321346, + "rewards/mask_iou_reward": 0.511187175660673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7300503700971603, + "rewards/thk_ans_format_reward": 1.0, + "step": 400, + "think_completion_length": 70.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.3125, + "epoch": 0.6762225969645869, + "grad_norm": 4.023558836956739, + "kl": 0.2392578125, + "learning_rate": 8.647554806070826e-07, + "loss": 0.0002, + "reward": 2.928010106086731, + "reward_std": 0.26283153891563416, + "rewards/final_reward": 1.1748140838127776, + "rewards/mask_iou_reward": 0.5874070419063888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.928010106086731, + "rewards/thk_ans_format_reward": 1.0, + "step": 401, + "think_completion_length": 65.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.984375, + "epoch": 0.6779089376053963, + "grad_norm": 3.6277340907732945, + "kl": 0.2958984375, + "learning_rate": 8.644182124789206e-07, + "loss": 0.0003, + "reward": 2.82840895652771, + "reward_std": 0.18483292683959007, + "rewards/final_reward": 0.09071974080914268, + "rewards/mask_iou_reward": 0.04535987040457134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8284089267253876, + "rewards/thk_ans_format_reward": 1.0, + "step": 402, + "think_completion_length": 66.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.34375, + "epoch": 0.6795952782462057, + "grad_norm": 5.293796217517415, + "kl": 0.26904296875, + "learning_rate": 8.640809443507588e-07, + "loss": 0.0003, + "reward": 3.4878629446029663, + "reward_std": 0.1448333915323019, + "rewards/final_reward": 1.632881523088085, + "rewards/mask_iou_reward": 0.8164407615440425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4878630638122559, + "rewards/thk_ans_format_reward": 1.0, + "step": 403, + "think_completion_length": 58.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.90625, + "epoch": 0.6812816188870152, + "grad_norm": 4.0214023431676615, + "kl": 0.298828125, + "learning_rate": 8.637436762225969e-07, + "loss": 0.0003, + "reward": 3.0970970392227173, + "reward_std": 0.24998241756111383, + "rewards/final_reward": 1.3948014535785171, + "rewards/mask_iou_reward": 0.6974007267892586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.097097098827362, + "rewards/thk_ans_format_reward": 1.0, + "step": 404, + "think_completion_length": 64.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.703125, + "epoch": 0.6829679595278246, + "grad_norm": 6.991066868890862, + "kl": 0.259765625, + "learning_rate": 8.63406408094435e-07, + "loss": 0.0003, + "reward": 2.830276131629944, + "reward_std": 0.557792603969574, + "rewards/final_reward": 0.96004537767179, + "rewards/mask_iou_reward": 0.480022688835895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8302761018276215, + "rewards/thk_ans_format_reward": 1.0, + "step": 405, + "think_completion_length": 64.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.9375, + "epoch": 0.684654300168634, + "grad_norm": 14.478623985464413, + "kl": 0.2919921875, + "learning_rate": 8.630691399662732e-07, + "loss": 0.0003, + "reward": 2.990003228187561, + "reward_std": 0.2519157975912094, + "rewards/final_reward": 0.8406945511209645, + "rewards/mask_iou_reward": 0.42034727556048224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.990003228187561, + "rewards/thk_ans_format_reward": 1.0, + "step": 406, + "think_completion_length": 59.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.234375, + "epoch": 0.6863406408094435, + "grad_norm": 4.662416823922717, + "kl": 0.431640625, + "learning_rate": 8.627318718381113e-07, + "loss": 0.0004, + "reward": 2.8567984104156494, + "reward_std": 0.23610374331474304, + "rewards/final_reward": 0.6864285578802869, + "rewards/mask_iou_reward": 0.34321427894014345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.856798380613327, + "rewards/thk_ans_format_reward": 1.0, + "step": 407, + "think_completion_length": 67.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.78125, + "epoch": 0.688026981450253, + "grad_norm": 3.5185540402928988, + "kl": 0.314453125, + "learning_rate": 8.623946037099494e-07, + "loss": 0.0003, + "reward": 3.2333940267562866, + "reward_std": 0.27213188260793686, + "rewards/final_reward": 1.3256075781426653, + "rewards/mask_iou_reward": 0.6628037890713326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2333939671516418, + "rewards/thk_ans_format_reward": 1.0, + "step": 408, + "think_completion_length": 65.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.921875, + "epoch": 0.6897133220910624, + "grad_norm": 58.126162154558536, + "kl": 0.2548828125, + "learning_rate": 8.620573355817875e-07, + "loss": 0.0002, + "reward": 3.4215975999832153, + "reward_std": 0.2836841717362404, + "rewards/final_reward": 1.3137488698038982, + "rewards/mask_iou_reward": 0.6568744349019491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4215975999832153, + "rewards/thk_ans_format_reward": 1.0, + "step": 409, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.390625, + "epoch": 0.6913996627318718, + "grad_norm": 4.944433841842423, + "kl": 0.251953125, + "learning_rate": 8.617200674536255e-07, + "loss": 0.0003, + "reward": 3.0755574703216553, + "reward_std": 0.3461499884724617, + "rewards/final_reward": 1.0501447498455052, + "rewards/mask_iou_reward": 0.5250723749227526, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0755574703216553, + "rewards/thk_ans_format_reward": 1.0, + "step": 410, + "think_completion_length": 62.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.109375, + "epoch": 0.6930860033726813, + "grad_norm": 22.612041850982212, + "kl": 0.28125, + "learning_rate": 8.613827993254636e-07, + "loss": 0.0003, + "reward": 3.604109525680542, + "reward_std": 0.19016021490097046, + "rewards/final_reward": 1.7495145669154235, + "rewards/mask_iou_reward": 0.8747572834577118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6041094660758972, + "rewards/thk_ans_format_reward": 1.0, + "step": 411, + "think_completion_length": 62.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.359375, + "epoch": 0.6947723440134908, + "grad_norm": 7.973198624210423, + "kl": 0.24951171875, + "learning_rate": 8.610455311973018e-07, + "loss": 0.0002, + "reward": 3.139096975326538, + "reward_std": 0.1365029662847519, + "rewards/final_reward": 0.7252312308037943, + "rewards/mask_iou_reward": 0.36261561540189713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1390970349311829, + "rewards/thk_ans_format_reward": 1.0, + "step": 412, + "think_completion_length": 58.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.875, + "epoch": 0.6964586846543002, + "grad_norm": 9.023858103950877, + "kl": 0.2607421875, + "learning_rate": 8.607082630691399e-07, + "loss": 0.0003, + "reward": 3.077267289161682, + "reward_std": 0.08271101489663124, + "rewards/final_reward": 1.5952865266384522, + "rewards/mask_iou_reward": 0.7976432633192261, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0772673785686493, + "rewards/thk_ans_format_reward": 1.0, + "step": 413, + "think_completion_length": 65.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.71875, + "epoch": 0.6981450252951096, + "grad_norm": 5.158677504548988, + "kl": 0.2666015625, + "learning_rate": 8.603709949409781e-07, + "loss": 0.0003, + "reward": 2.795508623123169, + "reward_std": 0.0809487490914762, + "rewards/final_reward": 0.917479949771516, + "rewards/mask_iou_reward": 0.458739974885758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7955086827278137, + "rewards/thk_ans_format_reward": 1.0, + "step": 414, + "think_completion_length": 54.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5, + "epoch": 0.6998313659359191, + "grad_norm": 5.030120907805138, + "kl": 0.3251953125, + "learning_rate": 8.600337268128162e-07, + "loss": 0.0003, + "reward": 3.044167995452881, + "reward_std": 0.06912581558572128, + "rewards/final_reward": 1.9550748572924652, + "rewards/mask_iou_reward": 0.9775374286462326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0441679954528809, + "rewards/thk_ans_format_reward": 1.0, + "step": 415, + "think_completion_length": 53.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.03125, + "epoch": 0.7015177065767285, + "grad_norm": 10.346031417599695, + "kl": 0.294921875, + "learning_rate": 8.596964586846543e-07, + "loss": 0.0003, + "reward": 3.1020259857177734, + "reward_std": 0.1669246181845665, + "rewards/final_reward": 1.4006585781103693, + "rewards/mask_iou_reward": 0.7003292890551847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.102025881409645, + "rewards/thk_ans_format_reward": 1.0, + "step": 416, + "think_completion_length": 50.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 0.7032040472175379, + "grad_norm": 5.559416835383098, + "kl": 0.267578125, + "learning_rate": 8.593591905564925e-07, + "loss": 0.0003, + "reward": 2.8304479122161865, + "reward_std": 0.1307654045522213, + "rewards/final_reward": 0.2791251550299971, + "rewards/mask_iou_reward": 0.13956257751499854, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8304478824138641, + "rewards/thk_ans_format_reward": 1.0, + "step": 417, + "think_completion_length": 62.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.765625, + "epoch": 0.7048903878583473, + "grad_norm": 16.700794405142094, + "kl": 0.271484375, + "learning_rate": 8.590219224283305e-07, + "loss": 0.0003, + "reward": 2.828887462615967, + "reward_std": 0.2920425906777382, + "rewards/final_reward": 0.5944796988166807, + "rewards/mask_iou_reward": 0.29723984940834036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8288873136043549, + "rewards/thk_ans_format_reward": 1.0, + "step": 418, + "think_completion_length": 60.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.625, + "epoch": 0.7065767284991569, + "grad_norm": 5.229023826318465, + "kl": 0.287109375, + "learning_rate": 8.586846543001685e-07, + "loss": 0.0003, + "reward": 2.6701961755752563, + "reward_std": 0.5146700888872147, + "rewards/final_reward": 0.768534040115647, + "rewards/mask_iou_reward": 0.3842670200578235, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.701446145772934, + "rewards/thk_ans_format_reward": 0.984375, + "step": 419, + "think_completion_length": 55.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.03125, + "epoch": 0.7082630691399663, + "grad_norm": 4.366390117260031, + "kl": 0.2724609375, + "learning_rate": 8.583473861720067e-07, + "loss": 0.0003, + "reward": 2.7694051265716553, + "reward_std": 0.35564553551375866, + "rewards/final_reward": 0.7180693794692734, + "rewards/mask_iou_reward": 0.3590346897346367, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.8319052159786224, + "rewards/thk_ans_format_reward": 0.96875, + "step": 420, + "think_completion_length": 61.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5625, + "epoch": 0.7099494097807757, + "grad_norm": 8.26275894078137, + "kl": 0.28515625, + "learning_rate": 8.580101180438448e-07, + "loss": 0.0003, + "reward": 3.165099620819092, + "reward_std": 0.32927022874355316, + "rewards/final_reward": 0.9518278358961747, + "rewards/mask_iou_reward": 0.47591391794808735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1650997400283813, + "rewards/thk_ans_format_reward": 1.0, + "step": 421, + "think_completion_length": 55.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.328125, + "epoch": 0.7116357504215851, + "grad_norm": 4.796511312898097, + "kl": 0.3564453125, + "learning_rate": 8.576728499156829e-07, + "loss": 0.0004, + "reward": 2.997436285018921, + "reward_std": 0.3313639760017395, + "rewards/final_reward": 0.5018222892159665, + "rewards/mask_iou_reward": 0.25091114460798325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9974363744258881, + "rewards/thk_ans_format_reward": 1.0, + "step": 422, + "think_completion_length": 58.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.71875, + "epoch": 0.7133220910623946, + "grad_norm": 17.89257487840126, + "kl": 0.29296875, + "learning_rate": 8.573355817875211e-07, + "loss": 0.0003, + "reward": 2.97784960269928, + "reward_std": 0.34186942130327225, + "rewards/final_reward": 1.1728364824516073, + "rewards/mask_iou_reward": 0.5864182412258037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9778496026992798, + "rewards/thk_ans_format_reward": 1.0, + "step": 423, + "think_completion_length": 56.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.890625, + "epoch": 0.715008431703204, + "grad_norm": 27.214424601047234, + "kl": 0.294921875, + "learning_rate": 8.569983136593592e-07, + "loss": 0.0003, + "reward": 2.7833904027938843, + "reward_std": 0.28692834824323654, + "rewards/final_reward": 0.9084582272270576, + "rewards/mask_iou_reward": 0.4542291136135288, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7833903729915619, + "rewards/thk_ans_format_reward": 1.0, + "step": 424, + "think_completion_length": 56.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.8125, + "epoch": 0.7166947723440135, + "grad_norm": 40.38312340410711, + "kl": 0.33984375, + "learning_rate": 8.566610455311973e-07, + "loss": 0.0003, + "reward": 2.905064821243286, + "reward_std": 0.1572525054216385, + "rewards/final_reward": 1.1837076841446386, + "rewards/mask_iou_reward": 0.5918538420723193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9050647914409637, + "rewards/thk_ans_format_reward": 1.0, + "step": 425, + "think_completion_length": 64.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.953125, + "epoch": 0.718381112984823, + "grad_norm": 7.976773584921408, + "kl": 0.6875, + "learning_rate": 8.563237774030355e-07, + "loss": 0.0007, + "reward": 3.05864155292511, + "reward_std": 0.3007048964500427, + "rewards/final_reward": 1.1670635620638414, + "rewards/mask_iou_reward": 0.5835317810319207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.058641493320465, + "rewards/thk_ans_format_reward": 1.0, + "step": 426, + "think_completion_length": 60.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.296875, + "epoch": 0.7200674536256324, + "grad_norm": 8.273608489018805, + "kl": 0.29296875, + "learning_rate": 8.559865092748734e-07, + "loss": 0.0003, + "reward": 2.8656177520751953, + "reward_std": 0.36590053141117096, + "rewards/final_reward": 0.5663732712946641, + "rewards/mask_iou_reward": 0.28318663564733204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8656177222728729, + "rewards/thk_ans_format_reward": 1.0, + "step": 427, + "think_completion_length": 53.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.65625, + "epoch": 0.7217537942664418, + "grad_norm": 9.39576308947908, + "kl": 0.2861328125, + "learning_rate": 8.556492411467115e-07, + "loss": 0.0003, + "reward": 2.8028112649917603, + "reward_std": 0.22308364510536194, + "rewards/final_reward": 1.1169452408649283, + "rewards/mask_iou_reward": 0.5584726204324642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8028113842010498, + "rewards/thk_ans_format_reward": 1.0, + "step": 428, + "think_completion_length": 49.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.59375, + "epoch": 0.7234401349072512, + "grad_norm": 5.036837067527288, + "kl": 0.30859375, + "learning_rate": 8.553119730185497e-07, + "loss": 0.0003, + "reward": 3.52843701839447, + "reward_std": 0.2567114308476448, + "rewards/final_reward": 1.6959970281876124, + "rewards/mask_iou_reward": 0.8479985140938062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5284370183944702, + "rewards/thk_ans_format_reward": 1.0, + "step": 429, + "think_completion_length": 55.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.671875, + "epoch": 0.7251264755480608, + "grad_norm": 8.45011833698191, + "kl": 0.294921875, + "learning_rate": 8.549747048903878e-07, + "loss": 0.0003, + "reward": 2.8881938457489014, + "reward_std": 0.11123907007277012, + "rewards/final_reward": 0.5909519443980289, + "rewards/mask_iou_reward": 0.29547597219901445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8881938755512238, + "rewards/thk_ans_format_reward": 1.0, + "step": 430, + "think_completion_length": 62.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5625, + "epoch": 0.7268128161888702, + "grad_norm": 5.126242185281033, + "kl": 0.3310546875, + "learning_rate": 8.546374367622259e-07, + "loss": 0.0003, + "reward": 3.2686800956726074, + "reward_std": 0.47582103312015533, + "rewards/final_reward": 1.1995088677204089, + "rewards/mask_iou_reward": 0.5997544338602044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.268679916858673, + "rewards/thk_ans_format_reward": 1.0, + "step": 431, + "think_completion_length": 56.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.328125, + "epoch": 0.7284991568296796, + "grad_norm": 7.510954485318972, + "kl": 0.3037109375, + "learning_rate": 8.543001686340641e-07, + "loss": 0.0003, + "reward": 2.963119626045227, + "reward_std": 0.5091882646083832, + "rewards/final_reward": 1.1603252698717035, + "rewards/mask_iou_reward": 0.5801626349358517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9631196856498718, + "rewards/thk_ans_format_reward": 1.0, + "step": 432, + "think_completion_length": 53.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.875, + "epoch": 0.730185497470489, + "grad_norm": 8.17404395724238, + "kl": 0.2451171875, + "learning_rate": 8.539629005059022e-07, + "loss": 0.0002, + "reward": 3.3582974672317505, + "reward_std": 0.27676521986722946, + "rewards/final_reward": 1.060991388104146, + "rewards/mask_iou_reward": 0.530495694052073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.358297348022461, + "rewards/thk_ans_format_reward": 1.0, + "step": 433, + "think_completion_length": 48.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.921875, + "epoch": 0.7318718381112985, + "grad_norm": 5.1275072969602675, + "kl": 0.2978515625, + "learning_rate": 8.536256323777403e-07, + "loss": 0.0003, + "reward": 3.2699029445648193, + "reward_std": 0.12833164259791374, + "rewards/final_reward": 1.371142408100008, + "rewards/mask_iou_reward": 0.685571204050004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.269902914762497, + "rewards/thk_ans_format_reward": 1.0, + "step": 434, + "think_completion_length": 63.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.40625, + "epoch": 0.7335581787521079, + "grad_norm": 8.844781927351123, + "kl": 0.2880859375, + "learning_rate": 8.532883642495783e-07, + "loss": 0.0005, + "reward": 3.6087805032730103, + "reward_std": 0.2516215443611145, + "rewards/final_reward": 1.46631224588285, + "rewards/mask_iou_reward": 0.733156122941425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6087803840637207, + "rewards/thk_ans_format_reward": 1.0, + "step": 435, + "think_completion_length": 58.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.84375, + "epoch": 0.7352445193929174, + "grad_norm": 9.045015551736409, + "kl": 0.283203125, + "learning_rate": 8.529510961214164e-07, + "loss": 0.0003, + "reward": 3.1191418170928955, + "reward_std": 0.17643820121884346, + "rewards/final_reward": 0.5086317075081106, + "rewards/mask_iou_reward": 0.2543158537540553, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1191417574882507, + "rewards/thk_ans_format_reward": 1.0, + "step": 436, + "think_completion_length": 55.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.25, + "epoch": 0.7369308600337268, + "grad_norm": 8.044272316503301, + "kl": 0.322265625, + "learning_rate": 8.526138279932546e-07, + "loss": 0.0003, + "reward": 2.923910140991211, + "reward_std": 0.2595134302973747, + "rewards/final_reward": 0.39993271395890284, + "rewards/mask_iou_reward": 0.19996635697945142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9239100217819214, + "rewards/thk_ans_format_reward": 1.0, + "step": 437, + "think_completion_length": 54.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.1875, + "epoch": 0.7386172006745363, + "grad_norm": 6.42343047551497, + "kl": 0.2958984375, + "learning_rate": 8.522765598650927e-07, + "loss": 0.0003, + "reward": 3.2516669034957886, + "reward_std": 0.3042123168706894, + "rewards/final_reward": 1.4425390635201096, + "rewards/mask_iou_reward": 0.7212695317600548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2516670227050781, + "rewards/thk_ans_format_reward": 1.0, + "step": 438, + "think_completion_length": 58.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3125, + "epoch": 0.7403035413153457, + "grad_norm": 4.765939573950677, + "kl": 0.2431640625, + "learning_rate": 8.519392917369308e-07, + "loss": 0.0002, + "reward": 2.7552138566970825, + "reward_std": 0.29110707342624664, + "rewards/final_reward": 0.7833496236061104, + "rewards/mask_iou_reward": 0.3916748118030552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7552138268947601, + "rewards/thk_ans_format_reward": 1.0, + "step": 439, + "think_completion_length": 56.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.4375, + "epoch": 0.7419898819561551, + "grad_norm": 6.901156078063723, + "kl": 0.259765625, + "learning_rate": 8.51602023608769e-07, + "loss": 0.0003, + "reward": 3.238754630088806, + "reward_std": 0.36059945821762085, + "rewards/final_reward": 1.6652710435028883, + "rewards/mask_iou_reward": 0.8326355217514442, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2543795108795166, + "rewards/thk_ans_format_reward": 1.0, + "step": 440, + "think_completion_length": 61.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.890625, + "epoch": 0.7436762225969646, + "grad_norm": 9.619707399796134, + "kl": 0.3603515625, + "learning_rate": 8.512647554806071e-07, + "loss": 0.0004, + "reward": 3.349504232406616, + "reward_std": 0.09203607961535454, + "rewards/final_reward": 1.811421625096705, + "rewards/mask_iou_reward": 0.9057108125483525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3495042622089386, + "rewards/thk_ans_format_reward": 1.0, + "step": 441, + "think_completion_length": 52.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.59375, + "epoch": 0.7453625632377741, + "grad_norm": 7.184268284891987, + "kl": 0.3095703125, + "learning_rate": 8.509274873524452e-07, + "loss": 0.0003, + "reward": 3.019321084022522, + "reward_std": 0.3987215608358383, + "rewards/final_reward": 0.8732108529144026, + "rewards/mask_iou_reward": 0.4366054264572013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0193210542201996, + "rewards/thk_ans_format_reward": 1.0, + "step": 442, + "think_completion_length": 47.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.015625, + "epoch": 0.7470489038785835, + "grad_norm": 12.515671581951322, + "kl": 0.326171875, + "learning_rate": 8.505902192242834e-07, + "loss": 0.0003, + "reward": 3.155945658683777, + "reward_std": 0.34589822590351105, + "rewards/final_reward": 1.1646884123020445, + "rewards/mask_iou_reward": 0.5823442061510222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1559456586837769, + "rewards/thk_ans_format_reward": 1.0, + "step": 443, + "think_completion_length": 59.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.921875, + "epoch": 0.7487352445193929, + "grad_norm": 3.547481875341391, + "kl": 0.287109375, + "learning_rate": 8.502529510961213e-07, + "loss": 0.0003, + "reward": 2.903050661087036, + "reward_std": 0.07714477553963661, + "rewards/final_reward": 1.0310059078481788, + "rewards/mask_iou_reward": 0.5155029539240894, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9030508100986481, + "rewards/thk_ans_format_reward": 1.0, + "step": 444, + "think_completion_length": 52.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.1875, + "epoch": 0.7504215851602024, + "grad_norm": 9.083111303607465, + "kl": 0.298828125, + "learning_rate": 8.499156829679594e-07, + "loss": 0.0003, + "reward": 3.021065354347229, + "reward_std": 0.15533466637134552, + "rewards/final_reward": 1.4759822302666619, + "rewards/mask_iou_reward": 0.7379911151333309, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.021065354347229, + "rewards/thk_ans_format_reward": 1.0, + "step": 445, + "think_completion_length": 58.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.359375, + "epoch": 0.7521079258010118, + "grad_norm": 7.087376979581363, + "kl": 0.26171875, + "learning_rate": 8.495784148397976e-07, + "loss": 0.0003, + "reward": 3.4048237800598145, + "reward_std": 0.4206378608942032, + "rewards/final_reward": 1.2012251051824736, + "rewards/mask_iou_reward": 0.6006125525912368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4048238396644592, + "rewards/thk_ans_format_reward": 1.0, + "step": 446, + "think_completion_length": 57.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.703125, + "epoch": 0.7537942664418212, + "grad_norm": 5.4476413106060875, + "kl": 0.24951171875, + "learning_rate": 8.492411467116357e-07, + "loss": 0.0003, + "reward": 3.3550602197647095, + "reward_std": 0.1978924423456192, + "rewards/final_reward": 1.5025127897357047, + "rewards/mask_iou_reward": 0.7512563948678523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.355060338973999, + "rewards/thk_ans_format_reward": 1.0, + "step": 447, + "think_completion_length": 49.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.90625, + "epoch": 0.7554806070826307, + "grad_norm": 4.176777412533717, + "kl": 0.2626953125, + "learning_rate": 8.489038785834738e-07, + "loss": 0.0003, + "reward": 3.3239400386810303, + "reward_std": 0.23745188117027283, + "rewards/final_reward": 1.7510122330778342, + "rewards/mask_iou_reward": 0.8755061165389171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3239399194717407, + "rewards/thk_ans_format_reward": 1.0, + "step": 448, + "think_completion_length": 52.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8125, + "epoch": 0.7571669477234402, + "grad_norm": 3.8561049155278124, + "kl": 0.298828125, + "learning_rate": 8.48566610455312e-07, + "loss": 0.0003, + "reward": 3.1175063848495483, + "reward_std": 0.3207996618002653, + "rewards/final_reward": 1.6053231020652237, + "rewards/mask_iou_reward": 0.8026615510326118, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1487562656402588, + "rewards/thk_ans_format_reward": 0.984375, + "step": 449, + "think_completion_length": 54.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.65625, + "epoch": 0.7588532883642496, + "grad_norm": 4.769355012789226, + "kl": 0.3505859375, + "learning_rate": 8.482293423271501e-07, + "loss": 0.0004, + "reward": 2.6317962408065796, + "reward_std": 0.16405843198299408, + "rewards/final_reward": 0.31454674175310304, + "rewards/mask_iou_reward": 0.15727337087655152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6317960768938065, + "rewards/thk_ans_format_reward": 1.0, + "step": 450, + "think_completion_length": 56.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.265625, + "epoch": 0.760539629005059, + "grad_norm": 7.400810968987018, + "kl": 0.2763671875, + "learning_rate": 8.478920741989882e-07, + "loss": 0.0003, + "reward": 3.499594807624817, + "reward_std": 0.24827680736780167, + "rewards/final_reward": 1.292793728943769, + "rewards/mask_iou_reward": 0.6463968644718845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4995947480201721, + "rewards/thk_ans_format_reward": 1.0, + "step": 451, + "think_completion_length": 50.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.59375, + "epoch": 0.7622259696458684, + "grad_norm": 6.445672712478356, + "kl": 0.341796875, + "learning_rate": 8.475548060708263e-07, + "loss": 0.0003, + "reward": 2.749513268470764, + "reward_std": 0.3254034221172333, + "rewards/final_reward": 0.9117902511222327, + "rewards/mask_iou_reward": 0.45589512556111633, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7495132386684418, + "rewards/thk_ans_format_reward": 1.0, + "step": 452, + "think_completion_length": 55.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.390625, + "epoch": 0.7639123102866779, + "grad_norm": 13.204711329933593, + "kl": 0.3173828125, + "learning_rate": 8.472175379426643e-07, + "loss": 0.0003, + "reward": 3.0490126609802246, + "reward_std": 0.40351493656635284, + "rewards/final_reward": 1.0314188630098042, + "rewards/mask_iou_reward": 0.5157094315049021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0490127503871918, + "rewards/thk_ans_format_reward": 1.0, + "step": 453, + "think_completion_length": 57.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.375, + "epoch": 0.7655986509274874, + "grad_norm": 6.324030150115455, + "kl": 0.3359375, + "learning_rate": 8.468802698145024e-07, + "loss": 0.0003, + "reward": 2.9564003944396973, + "reward_std": 0.23579375445842743, + "rewards/final_reward": 0.7703477680311337, + "rewards/mask_iou_reward": 0.38517388401556685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.956400454044342, + "rewards/thk_ans_format_reward": 1.0, + "step": 454, + "think_completion_length": 49.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.578125, + "epoch": 0.7672849915682968, + "grad_norm": 4.925706148560462, + "kl": 0.2919921875, + "learning_rate": 8.465430016863406e-07, + "loss": 0.0003, + "reward": 3.0482248067855835, + "reward_std": 0.3343783766031265, + "rewards/final_reward": 1.1269153962105007, + "rewards/mask_iou_reward": 0.5634576981052504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0482248067855835, + "rewards/thk_ans_format_reward": 1.0, + "step": 455, + "think_completion_length": 51.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.625, + "epoch": 0.7689713322091062, + "grad_norm": 5.465123188001045, + "kl": 0.2822265625, + "learning_rate": 8.462057335581787e-07, + "loss": 0.0003, + "reward": 3.0701215267181396, + "reward_std": 0.14017308503389359, + "rewards/final_reward": 1.8171637650895573, + "rewards/mask_iou_reward": 0.9085818825447787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.07012140750885, + "rewards/thk_ans_format_reward": 1.0, + "step": 456, + "think_completion_length": 52.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.828125, + "epoch": 0.7706576728499157, + "grad_norm": 4.005358062471752, + "kl": 0.3115234375, + "learning_rate": 8.458684654300168e-07, + "loss": 0.0003, + "reward": 2.987306594848633, + "reward_std": 0.3185913637280464, + "rewards/final_reward": 1.4051766924639586, + "rewards/mask_iou_reward": 0.7025883462319793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9873065650463104, + "rewards/thk_ans_format_reward": 1.0, + "step": 457, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.140625, + "epoch": 0.7723440134907251, + "grad_norm": 11.194917130532907, + "kl": 0.30078125, + "learning_rate": 8.45531197301855e-07, + "loss": 0.0003, + "reward": 3.5356940031051636, + "reward_std": 0.1198413036763668, + "rewards/final_reward": 1.6100836459413137, + "rewards/mask_iou_reward": 0.8050418229706569, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5356940627098083, + "rewards/thk_ans_format_reward": 1.0, + "step": 458, + "think_completion_length": 56.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.609375, + "epoch": 0.7740303541315345, + "grad_norm": 4.153232928144624, + "kl": 0.322265625, + "learning_rate": 8.451939291736931e-07, + "loss": 0.0003, + "reward": 3.207147717475891, + "reward_std": 0.38715776801109314, + "rewards/final_reward": 1.0808576457404304, + "rewards/mask_iou_reward": 0.5404288228702152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2071476578712463, + "rewards/thk_ans_format_reward": 1.0, + "step": 459, + "think_completion_length": 52.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.765625, + "epoch": 0.7757166947723441, + "grad_norm": 10.159888358886413, + "kl": 0.2685546875, + "learning_rate": 8.448566610455311e-07, + "loss": 0.0003, + "reward": 2.656245470046997, + "reward_std": 0.19464854151010513, + "rewards/final_reward": 0.43508841005784793, + "rewards/mask_iou_reward": 0.21754420502892396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6562454402446747, + "rewards/thk_ans_format_reward": 1.0, + "step": 460, + "think_completion_length": 49.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5625, + "epoch": 0.7774030354131535, + "grad_norm": 4.856925040426539, + "kl": 0.3203125, + "learning_rate": 8.445193929173693e-07, + "loss": 0.0003, + "reward": 3.1314727067947388, + "reward_std": 0.17312223464250565, + "rewards/final_reward": 1.608206343195508, + "rewards/mask_iou_reward": 0.804103171597754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1314727067947388, + "rewards/thk_ans_format_reward": 1.0, + "step": 461, + "think_completion_length": 58.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.546875, + "epoch": 0.7790893760539629, + "grad_norm": 14.882051882248911, + "kl": 0.291015625, + "learning_rate": 8.441821247892073e-07, + "loss": 0.0003, + "reward": 2.922551989555359, + "reward_std": 0.24346740171313286, + "rewards/final_reward": 1.2441537792490682, + "rewards/mask_iou_reward": 0.6220768896245341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9225519299507141, + "rewards/thk_ans_format_reward": 1.0, + "step": 462, + "think_completion_length": 56.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5, + "epoch": 0.7807757166947723, + "grad_norm": 7.4432447360727005, + "kl": 0.28662109375, + "learning_rate": 8.438448566610455e-07, + "loss": 0.0003, + "reward": 3.0491660833358765, + "reward_std": 0.3176800534129143, + "rewards/final_reward": 0.5361547861618222, + "rewards/mask_iou_reward": 0.2680773930809111, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.1116660237312317, + "rewards/thk_ans_format_reward": 1.0, + "step": 463, + "think_completion_length": 61.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.71875, + "epoch": 0.7824620573355818, + "grad_norm": 8.644391998731278, + "kl": 0.322265625, + "learning_rate": 8.435075885328836e-07, + "loss": 0.0003, + "reward": 3.0543339252471924, + "reward_std": 0.13533685728907585, + "rewards/final_reward": 1.2730188577493136, + "rewards/mask_iou_reward": 0.6365094288746568, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0543338507413864, + "rewards/thk_ans_format_reward": 1.0, + "step": 464, + "think_completion_length": 53.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.140625, + "epoch": 0.7841483979763912, + "grad_norm": 236.67023834501367, + "kl": 0.2685546875, + "learning_rate": 8.431703204047217e-07, + "loss": 0.0003, + "reward": 3.6658883094787598, + "reward_std": 0.15900836139917374, + "rewards/final_reward": 1.5048619609851641, + "rewards/mask_iou_reward": 0.7524309804925821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.665888249874115, + "rewards/thk_ans_format_reward": 1.0, + "step": 465, + "think_completion_length": 60.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.609375, + "epoch": 0.7858347386172007, + "grad_norm": 4.741441128552988, + "kl": 0.2890625, + "learning_rate": 8.428330522765599e-07, + "loss": 0.0003, + "reward": 3.2664116621017456, + "reward_std": 0.20561707019805908, + "rewards/final_reward": 1.6148482767720258, + "rewards/mask_iou_reward": 0.8074241383860129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.266411691904068, + "rewards/thk_ans_format_reward": 1.0, + "step": 466, + "think_completion_length": 53.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.984375, + "epoch": 0.7875210792580101, + "grad_norm": 6.036213971503519, + "kl": 0.310546875, + "learning_rate": 8.42495784148398e-07, + "loss": 0.0003, + "reward": 3.273001790046692, + "reward_std": 0.23280290514230728, + "rewards/final_reward": 1.3179749980699165, + "rewards/mask_iou_reward": 0.6589874990349582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2730017304420471, + "rewards/thk_ans_format_reward": 1.0, + "step": 467, + "think_completion_length": 58.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.84375, + "epoch": 0.7892074198988196, + "grad_norm": 7.819806233732989, + "kl": 0.322265625, + "learning_rate": 8.42158516020236e-07, + "loss": 0.0003, + "reward": 2.7713377475738525, + "reward_std": 0.21657298505306244, + "rewards/final_reward": 0.8699708025221008, + "rewards/mask_iou_reward": 0.4349854012610504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7713376581668854, + "rewards/thk_ans_format_reward": 1.0, + "step": 468, + "think_completion_length": 59.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.90625, + "epoch": 0.790893760539629, + "grad_norm": 5.404044503889141, + "kl": 0.3291015625, + "learning_rate": 8.418212478920742e-07, + "loss": 0.0003, + "reward": 3.161504626274109, + "reward_std": 0.2850091755390167, + "rewards/final_reward": 1.14387468377842, + "rewards/mask_iou_reward": 0.57193734188921, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1615045070648193, + "rewards/thk_ans_format_reward": 1.0, + "step": 469, + "think_completion_length": 56.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.96875, + "epoch": 0.7925801011804384, + "grad_norm": 7.604389194636123, + "kl": 0.3642578125, + "learning_rate": 8.414839797639123e-07, + "loss": 0.0004, + "reward": 2.8981428146362305, + "reward_std": 0.36169466376304626, + "rewards/final_reward": 1.0320069000804137, + "rewards/mask_iou_reward": 0.5160034500402069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8981429636478424, + "rewards/thk_ans_format_reward": 1.0, + "step": 470, + "think_completion_length": 65.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.40625, + "epoch": 0.7942664418212478, + "grad_norm": 12.40271629362927, + "kl": 0.27734375, + "learning_rate": 8.411467116357503e-07, + "loss": 0.0003, + "reward": 3.4766829013824463, + "reward_std": 0.17510409653186798, + "rewards/final_reward": 1.563823732939018, + "rewards/mask_iou_reward": 0.781911866469509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4766828417778015, + "rewards/thk_ans_format_reward": 1.0, + "step": 471, + "think_completion_length": 58.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.3125, + "epoch": 0.7959527824620574, + "grad_norm": 4.105674434527155, + "kl": 0.556640625, + "learning_rate": 8.408094435075885e-07, + "loss": 0.0006, + "reward": 3.3228079080581665, + "reward_std": 0.14854015782475471, + "rewards/final_reward": 1.5614363244482155, + "rewards/mask_iou_reward": 0.7807181622241077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3228079676628113, + "rewards/thk_ans_format_reward": 1.0, + "step": 472, + "think_completion_length": 53.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.515625, + "epoch": 0.7976391231028668, + "grad_norm": 14.121466586094188, + "kl": 0.3408203125, + "learning_rate": 8.404721753794266e-07, + "loss": 0.0003, + "reward": 2.8374698162078857, + "reward_std": 0.2110204752534628, + "rewards/final_reward": 0.6061206330509881, + "rewards/mask_iou_reward": 0.30306031652549403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8374697864055634, + "rewards/thk_ans_format_reward": 1.0, + "step": 473, + "think_completion_length": 55.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0625, + "epoch": 0.7993254637436762, + "grad_norm": 9.046995607060756, + "kl": 0.3359375, + "learning_rate": 8.401349072512647e-07, + "loss": 0.0003, + "reward": 3.3081858158111572, + "reward_std": 0.3434343636035919, + "rewards/final_reward": 1.4127900623675984, + "rewards/mask_iou_reward": 0.7063950311837992, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3081855773925781, + "rewards/thk_ans_format_reward": 1.0, + "step": 474, + "think_completion_length": 59.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.328125, + "epoch": 0.8010118043844857, + "grad_norm": 16.95326296908787, + "kl": 0.2724609375, + "learning_rate": 8.397976391231029e-07, + "loss": 0.0003, + "reward": 3.488841414451599, + "reward_std": 0.2021305412054062, + "rewards/final_reward": 1.6205699107923395, + "rewards/mask_iou_reward": 0.8102849553961697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4888412356376648, + "rewards/thk_ans_format_reward": 1.0, + "step": 475, + "think_completion_length": 53.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.9375, + "epoch": 0.8026981450252951, + "grad_norm": 8.257579139699448, + "kl": 0.353515625, + "learning_rate": 8.39460370994941e-07, + "loss": 0.0004, + "reward": 3.1359875202178955, + "reward_std": 0.1910172551870346, + "rewards/final_reward": 0.6040533924945672, + "rewards/mask_iou_reward": 0.3020266962472836, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1359875202178955, + "rewards/thk_ans_format_reward": 1.0, + "step": 476, + "think_completion_length": 60.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.265625, + "epoch": 0.8043844856661045, + "grad_norm": 8.58037790808054, + "kl": 0.2919921875, + "learning_rate": 8.39123102866779e-07, + "loss": 0.0003, + "reward": 3.001362442970276, + "reward_std": 0.11581205576658249, + "rewards/final_reward": 0.34856413924459007, + "rewards/mask_iou_reward": 0.17428206962229503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0013624429702759, + "rewards/thk_ans_format_reward": 1.0, + "step": 477, + "think_completion_length": 56.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.265625, + "epoch": 0.806070826306914, + "grad_norm": 3.9427534797600767, + "kl": 0.3232421875, + "learning_rate": 8.387858347386172e-07, + "loss": 0.0003, + "reward": 3.370658040046692, + "reward_std": 0.23730356991291046, + "rewards/final_reward": 1.6140919982067383, + "rewards/mask_iou_reward": 0.8070459991033692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.370658040046692, + "rewards/thk_ans_format_reward": 1.0, + "step": 478, + "think_completion_length": 56.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.390625, + "epoch": 0.8077571669477235, + "grad_norm": 3.3744960399290114, + "kl": 0.2802734375, + "learning_rate": 8.384485666104552e-07, + "loss": 0.0003, + "reward": 2.2663588523864746, + "reward_std": 0.2115717504057102, + "rewards/final_reward": 0.26012163497015883, + "rewards/mask_iou_reward": 0.13006081748507942, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.26635879278182983, + "rewards/thk_ans_format_reward": 1.0, + "step": 479, + "think_completion_length": 56.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.765625, + "epoch": 0.8094435075885329, + "grad_norm": 3.6117344364232564, + "kl": 0.2666015625, + "learning_rate": 8.381112984822933e-07, + "loss": 0.0003, + "reward": 2.8762000799179077, + "reward_std": 0.388136625289917, + "rewards/final_reward": 0.9140486507211179, + "rewards/mask_iou_reward": 0.4570243253605589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8762001395225525, + "rewards/thk_ans_format_reward": 1.0, + "step": 480, + "think_completion_length": 63.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.65625, + "epoch": 0.8111298482293423, + "grad_norm": 12.274297517004138, + "kl": 0.478515625, + "learning_rate": 8.377740303541315e-07, + "loss": 0.0005, + "reward": 3.2747581005096436, + "reward_std": 0.26797255873680115, + "rewards/final_reward": 1.6817001867058976, + "rewards/mask_iou_reward": 0.8408500933529488, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2903832793235779, + "rewards/thk_ans_format_reward": 1.0, + "step": 481, + "think_completion_length": 52.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.09375, + "epoch": 0.8128161888701517, + "grad_norm": 5.527163191205217, + "kl": 0.3662109375, + "learning_rate": 8.374367622259696e-07, + "loss": 0.0004, + "reward": 2.8398313522338867, + "reward_std": 0.3691897839307785, + "rewards/final_reward": 0.5070145637787651, + "rewards/mask_iou_reward": 0.25350728188938254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8398312926292419, + "rewards/thk_ans_format_reward": 1.0, + "step": 482, + "think_completion_length": 60.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.4375, + "epoch": 0.8145025295109612, + "grad_norm": 9.457144918818893, + "kl": 0.3134765625, + "learning_rate": 8.370994940978077e-07, + "loss": 0.0003, + "reward": 2.8676928281784058, + "reward_std": 0.21387110650539398, + "rewards/final_reward": 0.7955313358824923, + "rewards/mask_iou_reward": 0.3977656679412461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8676928579807281, + "rewards/thk_ans_format_reward": 1.0, + "step": 483, + "think_completion_length": 56.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.484375, + "epoch": 0.8161888701517707, + "grad_norm": 5.765839208328575, + "kl": 0.291015625, + "learning_rate": 8.367622259696459e-07, + "loss": 0.0003, + "reward": 3.332192301750183, + "reward_std": 0.2074635624885559, + "rewards/final_reward": 1.3884839448598976, + "rewards/mask_iou_reward": 0.6942419724299488, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3321923613548279, + "rewards/thk_ans_format_reward": 1.0, + "step": 484, + "think_completion_length": 60.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.515625, + "epoch": 0.8178752107925801, + "grad_norm": 13.058251127540307, + "kl": 0.291015625, + "learning_rate": 8.364249578414839e-07, + "loss": 0.0003, + "reward": 3.4136351346969604, + "reward_std": 0.1272813342511654, + "rewards/final_reward": 1.6252411969988985, + "rewards/mask_iou_reward": 0.8126205984994492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.413635015487671, + "rewards/thk_ans_format_reward": 1.0, + "step": 485, + "think_completion_length": 53.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.484375, + "epoch": 0.8195615514333895, + "grad_norm": 4.59058157342402, + "kl": 0.2724609375, + "learning_rate": 8.360876897133221e-07, + "loss": 0.0003, + "reward": 2.841328501701355, + "reward_std": 0.2238992303609848, + "rewards/final_reward": 0.4358989520701159, + "rewards/mask_iou_reward": 0.21794947603505796, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8413284122943878, + "rewards/thk_ans_format_reward": 1.0, + "step": 486, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.03125, + "epoch": 0.821247892074199, + "grad_norm": 5.2583036747207865, + "kl": 0.296875, + "learning_rate": 8.357504215851602e-07, + "loss": 0.0003, + "reward": 3.279032826423645, + "reward_std": 0.34882715344429016, + "rewards/final_reward": 1.1094078905881353, + "rewards/mask_iou_reward": 0.5547039452940676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2790327668190002, + "rewards/thk_ans_format_reward": 1.0, + "step": 487, + "think_completion_length": 56.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1875, + "epoch": 0.8229342327150084, + "grad_norm": 12.262889885729598, + "kl": 0.5107421875, + "learning_rate": 8.354131534569982e-07, + "loss": 0.0005, + "reward": 2.6642863750457764, + "reward_std": 0.4135005921125412, + "rewards/final_reward": 0.902978774756644, + "rewards/mask_iou_reward": 0.451489387378322, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.679911345243454, + "rewards/thk_ans_format_reward": 1.0, + "step": 488, + "think_completion_length": 57.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.296875, + "epoch": 0.8246205733558178, + "grad_norm": 6.485249587949392, + "kl": 0.322265625, + "learning_rate": 8.350758853288364e-07, + "loss": 0.0003, + "reward": 3.394331693649292, + "reward_std": 0.16855868697166443, + "rewards/final_reward": 1.1107550160110016, + "rewards/mask_iou_reward": 0.5553775080055008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3943317532539368, + "rewards/thk_ans_format_reward": 1.0, + "step": 489, + "think_completion_length": 57.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.453125, + "epoch": 0.8263069139966274, + "grad_norm": 8.664907971971088, + "kl": 0.34765625, + "learning_rate": 8.347386172006745e-07, + "loss": 0.0003, + "reward": 2.957345962524414, + "reward_std": 0.19981549307703972, + "rewards/final_reward": 0.8727845301331817, + "rewards/mask_iou_reward": 0.43639226506659085, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9573458433151245, + "rewards/thk_ans_format_reward": 1.0, + "step": 490, + "think_completion_length": 65.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.703125, + "epoch": 0.8279932546374368, + "grad_norm": 5.276331158403494, + "kl": 0.3896484375, + "learning_rate": 8.344013490725126e-07, + "loss": 0.0004, + "reward": 3.023526191711426, + "reward_std": 0.3681245595216751, + "rewards/final_reward": 0.7070453517651918, + "rewards/mask_iou_reward": 0.3535226758825959, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.05477637052536, + "rewards/thk_ans_format_reward": 1.0, + "step": 491, + "think_completion_length": 53.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.84375, + "epoch": 0.8296795952782462, + "grad_norm": 15.27579742522537, + "kl": 0.3515625, + "learning_rate": 8.340640809443508e-07, + "loss": 0.0004, + "reward": 3.013556480407715, + "reward_std": 0.594033494591713, + "rewards/final_reward": 0.626667284580518, + "rewards/mask_iou_reward": 0.313333642290259, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.0760565400123596, + "rewards/thk_ans_format_reward": 1.0, + "step": 492, + "think_completion_length": 55.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5, + "epoch": 0.8313659359190556, + "grad_norm": 4.6907462569949585, + "kl": 0.5, + "learning_rate": 8.337268128161888e-07, + "loss": 0.0005, + "reward": 2.9373987913131714, + "reward_std": 0.1304899863898754, + "rewards/final_reward": 0.9829659043346416, + "rewards/mask_iou_reward": 0.4914829521673208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9373987317085266, + "rewards/thk_ans_format_reward": 1.0, + "step": 493, + "think_completion_length": 58.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6875, + "epoch": 0.8330522765598651, + "grad_norm": 9.07332574925447, + "kl": 0.3671875, + "learning_rate": 8.333895446880269e-07, + "loss": 0.0004, + "reward": 3.1510684490203857, + "reward_std": 0.17891812324523926, + "rewards/final_reward": 0.48630485725933903, + "rewards/mask_iou_reward": 0.24315242862966951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1510685086250305, + "rewards/thk_ans_format_reward": 1.0, + "step": 494, + "think_completion_length": 58.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.796875, + "epoch": 0.8347386172006745, + "grad_norm": 4.086735711528925, + "kl": 0.3623046875, + "learning_rate": 8.330522765598651e-07, + "loss": 0.0004, + "reward": 3.338440179824829, + "reward_std": 0.2079987023025751, + "rewards/final_reward": 0.8487996371223201, + "rewards/mask_iou_reward": 0.42439981856116005, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3540650606155396, + "rewards/thk_ans_format_reward": 1.0, + "step": 495, + "think_completion_length": 58.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.125, + "epoch": 0.836424957841484, + "grad_norm": 5.467494355508177, + "kl": 0.3486328125, + "learning_rate": 8.327150084317032e-07, + "loss": 0.0003, + "reward": 2.9036959409713745, + "reward_std": 0.0915520153939724, + "rewards/final_reward": 0.5634759685080848, + "rewards/mask_iou_reward": 0.2817379842540424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9036959111690521, + "rewards/thk_ans_format_reward": 1.0, + "step": 496, + "think_completion_length": 55.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.734375, + "epoch": 0.8381112984822934, + "grad_norm": 4.609982837739563, + "kl": 0.3603515625, + "learning_rate": 8.323777403035412e-07, + "loss": 0.0004, + "reward": 2.9951345920562744, + "reward_std": 0.14687485992908478, + "rewards/final_reward": 0.5120508227904494, + "rewards/mask_iou_reward": 0.2560254113952247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.995134562253952, + "rewards/thk_ans_format_reward": 1.0, + "step": 497, + "think_completion_length": 57.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.765625, + "epoch": 0.8397976391231029, + "grad_norm": 4.49715286866878, + "kl": 0.6982421875, + "learning_rate": 8.320404721753794e-07, + "loss": 0.0007, + "reward": 2.702091932296753, + "reward_std": 0.06957501918077469, + "rewards/final_reward": 0.5184969514126482, + "rewards/mask_iou_reward": 0.2592484757063241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.702091857790947, + "rewards/thk_ans_format_reward": 1.0, + "step": 498, + "think_completion_length": 57.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.21875, + "epoch": 0.8414839797639123, + "grad_norm": 5.876436406383071, + "kl": 0.41015625, + "learning_rate": 8.317032040472175e-07, + "loss": 0.0004, + "reward": 3.532025933265686, + "reward_std": 0.31206123530864716, + "rewards/final_reward": 1.6488765208842402, + "rewards/mask_iou_reward": 0.8244382604421201, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5320259928703308, + "rewards/thk_ans_format_reward": 1.0, + "step": 499, + "think_completion_length": 60.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.609375, + "epoch": 0.8431703204047217, + "grad_norm": 8.430320529931935, + "kl": 0.3818359375, + "learning_rate": 8.313659359190556e-07, + "loss": 0.0004, + "reward": 3.1462095975875854, + "reward_std": 0.14214863628149033, + "rewards/final_reward": 1.4178498719028585, + "rewards/mask_iou_reward": 0.7089249359514292, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1462095975875854, + "rewards/thk_ans_format_reward": 1.0, + "step": 500, + "think_completion_length": 55.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.984375, + "epoch": 0.8448566610455311, + "grad_norm": 11.055793749524597, + "kl": 0.37109375, + "learning_rate": 8.310286677908938e-07, + "loss": 0.0004, + "reward": 3.4042888879776, + "reward_std": 0.2672403007745743, + "rewards/final_reward": 1.3030006843771489, + "rewards/mask_iou_reward": 0.6515003421885744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4042889475822449, + "rewards/thk_ans_format_reward": 1.0, + "step": 501, + "think_completion_length": 59.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.15625, + "epoch": 0.8465430016863407, + "grad_norm": 4.353326318987238, + "kl": 0.8125, + "learning_rate": 8.306913996627318e-07, + "loss": 0.0008, + "reward": 2.6968055963516235, + "reward_std": 0.42137467861175537, + "rewards/final_reward": 0.5881671125422409, + "rewards/mask_iou_reward": 0.3264447243490181, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7124305069446564, + "rewards/thk_ans_format_reward": 1.0, + "step": 502, + "think_completion_length": 57.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.453125, + "epoch": 0.8482293423271501, + "grad_norm": 3.952799501986196, + "kl": 0.3857421875, + "learning_rate": 8.303541315345699e-07, + "loss": 0.0004, + "reward": 3.111401081085205, + "reward_std": 0.23835711553692818, + "rewards/final_reward": 1.4018418315631394, + "rewards/mask_iou_reward": 0.7009209157815697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.111401081085205, + "rewards/thk_ans_format_reward": 1.0, + "step": 503, + "think_completion_length": 58.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.921875, + "epoch": 0.8499156829679595, + "grad_norm": 6.092795086100114, + "kl": 0.408203125, + "learning_rate": 8.300168634064081e-07, + "loss": 0.0004, + "reward": 3.0179593563079834, + "reward_std": 0.43792441487312317, + "rewards/final_reward": 1.1153791394943096, + "rewards/mask_iou_reward": 0.5576895697471548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0179592669010162, + "rewards/thk_ans_format_reward": 1.0, + "step": 504, + "think_completion_length": 63.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.109375, + "epoch": 0.851602023608769, + "grad_norm": 5.84142921702047, + "kl": 0.38671875, + "learning_rate": 8.296795952782462e-07, + "loss": 0.0004, + "reward": 2.6653823852539062, + "reward_std": 0.3925536721944809, + "rewards/final_reward": 1.2697986540779345, + "rewards/mask_iou_reward": 0.6348993270389672, + "rewards/sam_format_reward": 0.875, + "rewards/sam_reward_func_ultra": 0.790382444858551, + "rewards/thk_ans_format_reward": 1.0, + "step": 505, + "think_completion_length": 60.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.421875, + "epoch": 0.8532883642495784, + "grad_norm": 9.174156204307186, + "kl": 0.3623046875, + "learning_rate": 8.293423271500842e-07, + "loss": 0.0004, + "reward": 3.216946005821228, + "reward_std": 0.12585578113794327, + "rewards/final_reward": 1.8653463056949018, + "rewards/mask_iou_reward": 0.9326731528474509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.216946005821228, + "rewards/thk_ans_format_reward": 1.0, + "step": 506, + "think_completion_length": 56.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.125, + "epoch": 0.8549747048903878, + "grad_norm": 4.338273951521515, + "kl": 0.4072265625, + "learning_rate": 8.290050590219224e-07, + "loss": 0.0004, + "reward": 2.273374557495117, + "reward_std": 0.08910224586725235, + "rewards/final_reward": 0.3796306792781109, + "rewards/mask_iou_reward": 0.18981533963905545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.2733745574951172, + "rewards/thk_ans_format_reward": 1.0, + "step": 507, + "think_completion_length": 64.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.53125, + "epoch": 0.8566610455311973, + "grad_norm": 4.048280846796833, + "kl": 0.3779296875, + "learning_rate": 8.286677908937605e-07, + "loss": 0.0004, + "reward": 3.340023159980774, + "reward_std": 0.14114241860806942, + "rewards/final_reward": 1.3392693923469996, + "rewards/mask_iou_reward": 0.6696346961734998, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3400232195854187, + "rewards/thk_ans_format_reward": 1.0, + "step": 508, + "think_completion_length": 62.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 0.8583473861720068, + "grad_norm": 6.439504346827565, + "kl": 0.4326171875, + "learning_rate": 8.283305227655986e-07, + "loss": 0.0004, + "reward": 3.283868193626404, + "reward_std": 0.1502309814095497, + "rewards/final_reward": 1.4718184265458358, + "rewards/mask_iou_reward": 0.7359092132729179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2838680744171143, + "rewards/thk_ans_format_reward": 1.0, + "step": 509, + "think_completion_length": 65.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.0625, + "epoch": 0.8600337268128162, + "grad_norm": 6.735960884512077, + "kl": 0.40234375, + "learning_rate": 8.279932546374367e-07, + "loss": 0.0004, + "reward": 2.945590615272522, + "reward_std": 0.16057924553751945, + "rewards/final_reward": 1.1542222895062477, + "rewards/mask_iou_reward": 0.5771111447531239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9455906599760056, + "rewards/thk_ans_format_reward": 1.0, + "step": 510, + "think_completion_length": 63.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5625, + "epoch": 0.8617200674536256, + "grad_norm": 5.712862299060118, + "kl": 0.4208984375, + "learning_rate": 8.276559865092748e-07, + "loss": 0.0004, + "reward": 2.683882713317871, + "reward_std": 0.37942691147327423, + "rewards/final_reward": 0.5198905041900906, + "rewards/mask_iou_reward": 0.2599452520950453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6838827431201935, + "rewards/thk_ans_format_reward": 1.0, + "step": 511, + "think_completion_length": 63.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.46875, + "epoch": 0.863406408094435, + "grad_norm": 28.652231899021555, + "kl": 0.3955078125, + "learning_rate": 8.27318718381113e-07, + "loss": 0.0004, + "reward": 2.9515405893325806, + "reward_std": 0.3559226468205452, + "rewards/final_reward": 1.0275246177182993, + "rewards/mask_iou_reward": 0.5137623088591496, + "rewards/sam_format_reward": 0.890625, + "rewards/sam_reward_func_ultra": 1.060915619134903, + "rewards/thk_ans_format_reward": 1.0, + "step": 512, + "think_completion_length": 66.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.171875, + "epoch": 0.8650927487352446, + "grad_norm": 4.1058269502843645, + "kl": 0.3701171875, + "learning_rate": 8.269814502529511e-07, + "loss": 0.0004, + "reward": 2.8999738693237305, + "reward_std": 0.0674455501139164, + "rewards/final_reward": 0.47537297320235095, + "rewards/mask_iou_reward": 0.23768648660117547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8999738991260529, + "rewards/thk_ans_format_reward": 1.0, + "step": 513, + "think_completion_length": 61.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.328125, + "epoch": 0.866779089376054, + "grad_norm": 2.9463795478386157, + "kl": 0.376953125, + "learning_rate": 8.266441821247892e-07, + "loss": 0.0004, + "reward": 3.56209397315979, + "reward_std": 0.24439280480146408, + "rewards/final_reward": 1.7721260543580972, + "rewards/mask_iou_reward": 0.8860630271790486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.562093734741211, + "rewards/thk_ans_format_reward": 1.0, + "step": 514, + "think_completion_length": 68.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.453125, + "epoch": 0.8684654300168634, + "grad_norm": 5.345688348718463, + "kl": 0.373046875, + "learning_rate": 8.263069139966273e-07, + "loss": 0.0004, + "reward": 2.648374915122986, + "reward_std": 0.14371401071548462, + "rewards/final_reward": 0.8322886750975954, + "rewards/mask_iou_reward": 0.4161443375487977, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6483748480677605, + "rewards/thk_ans_format_reward": 1.0, + "step": 515, + "think_completion_length": 63.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.734375, + "epoch": 0.8701517706576728, + "grad_norm": 6.266616889593935, + "kl": 0.3095703125, + "learning_rate": 8.259696458684654e-07, + "loss": 0.0003, + "reward": 3.183814764022827, + "reward_std": 0.20516617968678474, + "rewards/final_reward": 1.4832190450462837, + "rewards/mask_iou_reward": 0.7416095225231418, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.199439823627472, + "rewards/thk_ans_format_reward": 1.0, + "step": 516, + "think_completion_length": 63.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.78125, + "epoch": 0.8718381112984823, + "grad_norm": 6.359772663934902, + "kl": 0.396484375, + "learning_rate": 8.256323777403035e-07, + "loss": 0.0004, + "reward": 3.377490758895874, + "reward_std": 0.16431526839733124, + "rewards/final_reward": 1.2387319760179651, + "rewards/mask_iou_reward": 0.6193659880089826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3774908185005188, + "rewards/thk_ans_format_reward": 1.0, + "step": 517, + "think_completion_length": 58.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.671875, + "epoch": 0.8735244519392917, + "grad_norm": 8.28235082705572, + "kl": 0.392578125, + "learning_rate": 8.252951096121416e-07, + "loss": 0.0004, + "reward": 3.2795443534851074, + "reward_std": 0.2162407599389553, + "rewards/final_reward": 1.4022729958833613, + "rewards/mask_iou_reward": 0.7011364979416806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2795442342758179, + "rewards/thk_ans_format_reward": 1.0, + "step": 518, + "think_completion_length": 60.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.65625, + "epoch": 0.8752107925801011, + "grad_norm": 16.66235998201711, + "kl": 0.38671875, + "learning_rate": 8.249578414839797e-07, + "loss": 0.0004, + "reward": 3.3192691802978516, + "reward_std": 0.16263797972351313, + "rewards/final_reward": 1.0817492556014303, + "rewards/mask_iou_reward": 0.5408746278007152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3192691802978516, + "rewards/thk_ans_format_reward": 1.0, + "step": 519, + "think_completion_length": 62.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.21875, + "epoch": 0.8768971332209107, + "grad_norm": 7.827687818773585, + "kl": 0.330078125, + "learning_rate": 8.246205733558178e-07, + "loss": 0.0003, + "reward": 2.7803075313568115, + "reward_std": 0.31534768640995026, + "rewards/final_reward": 0.7727706577683782, + "rewards/mask_iou_reward": 0.3863853288841891, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7959325313568115, + "rewards/thk_ans_format_reward": 1.0, + "step": 520, + "think_completion_length": 74.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.203125, + "epoch": 0.8785834738617201, + "grad_norm": 4.032912423168109, + "kl": 0.353515625, + "learning_rate": 8.24283305227656e-07, + "loss": 0.0004, + "reward": 3.183266282081604, + "reward_std": 0.2568514347076416, + "rewards/final_reward": 1.327475335566459, + "rewards/mask_iou_reward": 0.6637376677832295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1832663118839264, + "rewards/thk_ans_format_reward": 1.0, + "step": 521, + "think_completion_length": 65.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.828125, + "epoch": 0.8802698145025295, + "grad_norm": 10.421736927007183, + "kl": 0.3662109375, + "learning_rate": 8.239460370994941e-07, + "loss": 0.0004, + "reward": 3.470924496650696, + "reward_std": 0.10463305935263634, + "rewards/final_reward": 1.0585844424707966, + "rewards/mask_iou_reward": 0.5292922212353983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4709245562553406, + "rewards/thk_ans_format_reward": 1.0, + "step": 522, + "think_completion_length": 58.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.484375, + "epoch": 0.8819561551433389, + "grad_norm": 5.559082708566229, + "kl": 0.5234375, + "learning_rate": 8.236087689713322e-07, + "loss": 0.0005, + "reward": 3.0396599769592285, + "reward_std": 0.157942034304142, + "rewards/final_reward": 1.1188812180385286, + "rewards/mask_iou_reward": 0.5594406090192643, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0396599769592285, + "rewards/thk_ans_format_reward": 1.0, + "step": 523, + "think_completion_length": 67.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.484375, + "epoch": 0.8836424957841484, + "grad_norm": 6.252306064416424, + "kl": 0.4580078125, + "learning_rate": 8.232715008431703e-07, + "loss": 0.0005, + "reward": 3.2086377143859863, + "reward_std": 0.2545919269323349, + "rewards/final_reward": 1.1811606394896785, + "rewards/mask_iou_reward": 0.5905803197448393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2086376249790192, + "rewards/thk_ans_format_reward": 1.0, + "step": 524, + "think_completion_length": 64.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.140625, + "epoch": 0.8853288364249579, + "grad_norm": 4.371512130440283, + "kl": 0.5361328125, + "learning_rate": 8.229342327150084e-07, + "loss": 0.0005, + "reward": 3.3854466676712036, + "reward_std": 0.10468383133411407, + "rewards/final_reward": 1.4435736608997933, + "rewards/mask_iou_reward": 0.7217868304498967, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3854466080665588, + "rewards/thk_ans_format_reward": 1.0, + "step": 525, + "think_completion_length": 67.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.875, + "epoch": 0.8870151770657673, + "grad_norm": 5.152020009962662, + "kl": 0.3486328125, + "learning_rate": 8.225969645868464e-07, + "loss": 0.0003, + "reward": 3.441176652908325, + "reward_std": 0.2706274315714836, + "rewards/final_reward": 1.7451023757817863, + "rewards/mask_iou_reward": 0.8725511878908931, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4411765933036804, + "rewards/thk_ans_format_reward": 1.0, + "step": 526, + "think_completion_length": 72.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.625, + "epoch": 0.8887015177065767, + "grad_norm": 21.687079355389578, + "kl": 0.3818359375, + "learning_rate": 8.222596964586846e-07, + "loss": 0.0004, + "reward": 2.9406185150146484, + "reward_std": 0.2899327874183655, + "rewards/final_reward": 0.8891764659295585, + "rewards/mask_iou_reward": 0.44458823296477923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9406185150146484, + "rewards/thk_ans_format_reward": 1.0, + "step": 527, + "think_completion_length": 60.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.828125, + "epoch": 0.8903878583473862, + "grad_norm": 4.36400370590007, + "kl": 0.375, + "learning_rate": 8.219224283305227e-07, + "loss": 0.0004, + "reward": 2.7132362127304077, + "reward_std": 0.1695428118109703, + "rewards/final_reward": 0.23595122885585143, + "rewards/mask_iou_reward": 0.11797561442792572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7132362425327301, + "rewards/thk_ans_format_reward": 1.0, + "step": 528, + "think_completion_length": 64.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.65625, + "epoch": 0.8920741989881956, + "grad_norm": 13.437495085356725, + "kl": 0.4208984375, + "learning_rate": 8.215851602023608e-07, + "loss": 0.0004, + "reward": 3.3183414936065674, + "reward_std": 0.2359558790922165, + "rewards/final_reward": 1.1896120618016504, + "rewards/mask_iou_reward": 0.5948060309008252, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3183414936065674, + "rewards/thk_ans_format_reward": 1.0, + "step": 529, + "think_completion_length": 66.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.296875, + "epoch": 0.893760539629005, + "grad_norm": 7.6705294118384275, + "kl": 0.3984375, + "learning_rate": 8.21247892074199e-07, + "loss": 0.0004, + "reward": 3.419031500816345, + "reward_std": 0.2190863024443388, + "rewards/final_reward": 1.1311254608719006, + "rewards/mask_iou_reward": 0.5655627304359503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.41903156042099, + "rewards/thk_ans_format_reward": 1.0, + "step": 530, + "think_completion_length": 64.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.078125, + "epoch": 0.8954468802698144, + "grad_norm": 20.472054925385102, + "kl": 0.3505859375, + "learning_rate": 8.209106239460371e-07, + "loss": 0.0003, + "reward": 3.381049633026123, + "reward_std": 0.23917018435895443, + "rewards/final_reward": 1.5125118845948695, + "rewards/mask_iou_reward": 0.7562559422974348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3810496926307678, + "rewards/thk_ans_format_reward": 1.0, + "step": 531, + "think_completion_length": 64.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.328125, + "epoch": 0.897133220910624, + "grad_norm": 3.8168364492030333, + "kl": 0.3720703125, + "learning_rate": 8.205733558178752e-07, + "loss": 0.0003, + "reward": 3.023830771446228, + "reward_std": 0.169752950896509, + "rewards/final_reward": 1.1518864899917212, + "rewards/mask_iou_reward": 0.5759432449958606, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.0707058385014534, + "rewards/thk_ans_format_reward": 0.984375, + "step": 532, + "think_completion_length": 64.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.703125, + "epoch": 0.8988195615514334, + "grad_norm": 15.967853474424993, + "kl": 0.384765625, + "learning_rate": 8.202360876897133e-07, + "loss": 0.0004, + "reward": 2.7728532552719116, + "reward_std": 0.139126755297184, + "rewards/final_reward": 0.5571142070550955, + "rewards/mask_iou_reward": 0.27855710352754776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.772853247821331, + "rewards/thk_ans_format_reward": 1.0, + "step": 533, + "think_completion_length": 76.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.515625, + "epoch": 0.9005059021922428, + "grad_norm": 14.985099224834219, + "kl": 1.0390625, + "learning_rate": 8.198988195615514e-07, + "loss": 0.001, + "reward": 2.8130897283554077, + "reward_std": 0.22141174226999283, + "rewards/final_reward": 0.3782079504236656, + "rewards/mask_iou_reward": 0.1891039752118328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8130897581577301, + "rewards/thk_ans_format_reward": 1.0, + "step": 534, + "think_completion_length": 65.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.171875, + "epoch": 0.9021922428330523, + "grad_norm": 4.841638655066179, + "kl": 0.3994140625, + "learning_rate": 8.195615514333894e-07, + "loss": 0.0004, + "reward": 2.7947330474853516, + "reward_std": 0.16086240857839584, + "rewards/final_reward": 0.5827494551033152, + "rewards/mask_iou_reward": 0.2913747275516576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.794733077287674, + "rewards/thk_ans_format_reward": 1.0, + "step": 535, + "think_completion_length": 73.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.234375, + "epoch": 0.9038785834738617, + "grad_norm": 3.6751630699269984, + "kl": 0.4228515625, + "learning_rate": 8.192242833052276e-07, + "loss": 0.0004, + "reward": 3.0260074138641357, + "reward_std": 0.11613265797495842, + "rewards/final_reward": 0.7657504846263233, + "rewards/mask_iou_reward": 0.38287524231316167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.041632503271103, + "rewards/thk_ans_format_reward": 0.984375, + "step": 536, + "think_completion_length": 64.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.515625, + "epoch": 0.9055649241146712, + "grad_norm": 4.476174693329742, + "kl": 0.453125, + "learning_rate": 8.188870151770657e-07, + "loss": 0.0004, + "reward": 3.5284619331359863, + "reward_std": 0.05958326905965805, + "rewards/final_reward": 1.8054081588094992, + "rewards/mask_iou_reward": 0.9027040794047496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5284621119499207, + "rewards/thk_ans_format_reward": 1.0, + "step": 537, + "think_completion_length": 65.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5625, + "epoch": 0.9072512647554806, + "grad_norm": 4.295697569667424, + "kl": 0.369140625, + "learning_rate": 8.185497470489039e-07, + "loss": 0.0004, + "reward": 2.8253973722457886, + "reward_std": 0.17487270198762417, + "rewards/final_reward": 0.36516283126835664, + "rewards/mask_iou_reward": 0.18258141563417832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8253973722457886, + "rewards/thk_ans_format_reward": 1.0, + "step": 538, + "think_completion_length": 71.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.59375, + "epoch": 0.9089376053962901, + "grad_norm": 12.537328062284256, + "kl": 0.380859375, + "learning_rate": 8.18212478920742e-07, + "loss": 0.0004, + "reward": 3.381394863128662, + "reward_std": 0.16334578022360802, + "rewards/final_reward": 1.6468299087358207, + "rewards/mask_iou_reward": 0.8234149543679103, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.381394863128662, + "rewards/thk_ans_format_reward": 1.0, + "step": 539, + "think_completion_length": 67.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0625, + "epoch": 0.9106239460370995, + "grad_norm": 5.631609952798637, + "kl": 0.373046875, + "learning_rate": 8.178752107925801e-07, + "loss": 0.0004, + "reward": 3.1018731594085693, + "reward_std": 0.4340359643101692, + "rewards/final_reward": 1.2089573280546717, + "rewards/mask_iou_reward": 0.6044786640273359, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1487482786178589, + "rewards/thk_ans_format_reward": 0.96875, + "step": 540, + "think_completion_length": 70.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.375, + "epoch": 0.9123102866779089, + "grad_norm": 6.251334882971103, + "kl": 0.3701171875, + "learning_rate": 8.175379426644183e-07, + "loss": 0.0004, + "reward": 3.3850208520889282, + "reward_std": 0.15002886205911636, + "rewards/final_reward": 1.3596994323781733, + "rewards/mask_iou_reward": 0.6798497161890866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3850208520889282, + "rewards/thk_ans_format_reward": 1.0, + "step": 541, + "think_completion_length": 70.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5625, + "epoch": 0.9139966273187183, + "grad_norm": 6.1318709306129255, + "kl": 0.3701171875, + "learning_rate": 8.172006745362563e-07, + "loss": 0.0004, + "reward": 3.1013495922088623, + "reward_std": 0.48096051812171936, + "rewards/final_reward": 1.24424121940097, + "rewards/mask_iou_reward": 0.622120609700485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1013494729995728, + "rewards/thk_ans_format_reward": 1.0, + "step": 542, + "think_completion_length": 67.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.40625, + "epoch": 0.9156829679595279, + "grad_norm": 10.854889940904636, + "kl": 0.298828125, + "learning_rate": 8.168634064080943e-07, + "loss": 0.0003, + "reward": 3.2696839570999146, + "reward_std": 0.17891769856214523, + "rewards/final_reward": 1.592977107171023, + "rewards/mask_iou_reward": 0.7964885535855115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.269684076309204, + "rewards/thk_ans_format_reward": 1.0, + "step": 543, + "think_completion_length": 70.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.78125, + "epoch": 0.9173693086003373, + "grad_norm": 4.23331281750137, + "kl": 0.369140625, + "learning_rate": 8.165261382799325e-07, + "loss": 0.0004, + "reward": 3.4304239749908447, + "reward_std": 0.1314825750887394, + "rewards/final_reward": 1.207980122650095, + "rewards/mask_iou_reward": 0.6039900613250475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4304240345954895, + "rewards/thk_ans_format_reward": 1.0, + "step": 544, + "think_completion_length": 70.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.265625, + "epoch": 0.9190556492411467, + "grad_norm": 7.031147279257311, + "kl": 0.3369140625, + "learning_rate": 8.161888701517706e-07, + "loss": 0.0003, + "reward": 3.361212372779846, + "reward_std": 0.21165333688259125, + "rewards/final_reward": 1.2343404362180397, + "rewards/mask_iou_reward": 0.6171702181090198, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3612123727798462, + "rewards/thk_ans_format_reward": 1.0, + "step": 545, + "think_completion_length": 78.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 0.9207419898819561, + "grad_norm": 5.202780347128779, + "kl": 0.3369140625, + "learning_rate": 8.158516020236087e-07, + "loss": 0.0003, + "reward": 3.3819774389266968, + "reward_std": 0.21501677110791206, + "rewards/final_reward": 1.4839641986939198, + "rewards/mask_iou_reward": 0.7419820993469599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3819775581359863, + "rewards/thk_ans_format_reward": 1.0, + "step": 546, + "think_completion_length": 73.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.484375, + "epoch": 0.9224283305227656, + "grad_norm": 14.149938755345266, + "kl": 0.369140625, + "learning_rate": 8.155143338954469e-07, + "loss": 0.0004, + "reward": 3.032560348510742, + "reward_std": 0.3051258474588394, + "rewards/final_reward": 1.182298730010252, + "rewards/mask_iou_reward": 0.591149365005126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0325604677200317, + "rewards/thk_ans_format_reward": 1.0, + "step": 547, + "think_completion_length": 79.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0625, + "epoch": 0.924114671163575, + "grad_norm": 7.103217899009958, + "kl": 0.3955078125, + "learning_rate": 8.15177065767285e-07, + "loss": 0.0004, + "reward": 3.4782878160476685, + "reward_std": 0.29576554894447327, + "rewards/final_reward": 1.7047875744492655, + "rewards/mask_iou_reward": 0.8523937872246328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4782878160476685, + "rewards/thk_ans_format_reward": 1.0, + "step": 548, + "think_completion_length": 75.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.96875, + "epoch": 0.9258010118043845, + "grad_norm": 4.674635909867585, + "kl": 0.3935546875, + "learning_rate": 8.148397976391231e-07, + "loss": 0.0004, + "reward": 3.0649091005325317, + "reward_std": 0.10538779571652412, + "rewards/final_reward": 1.4885215280987056, + "rewards/mask_iou_reward": 0.7442607640493528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0649092197418213, + "rewards/thk_ans_format_reward": 1.0, + "step": 549, + "think_completion_length": 77.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.578125, + "epoch": 0.927487352445194, + "grad_norm": 10.824977561332439, + "kl": 0.392578125, + "learning_rate": 8.145025295109613e-07, + "loss": 0.0004, + "reward": 2.901737332344055, + "reward_std": 0.2872447445988655, + "rewards/final_reward": 0.3606779283185213, + "rewards/mask_iou_reward": 0.18033896415926065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9017373025417328, + "rewards/thk_ans_format_reward": 1.0, + "step": 550, + "think_completion_length": 80.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.140625, + "epoch": 0.9291736930860034, + "grad_norm": 3.335434593783744, + "kl": 0.3525390625, + "learning_rate": 8.141652613827992e-07, + "loss": 0.0004, + "reward": 2.98220694065094, + "reward_std": 0.1780674085021019, + "rewards/final_reward": 0.9759100328842547, + "rewards/mask_iou_reward": 0.48795501644212735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9822070002555847, + "rewards/thk_ans_format_reward": 1.0, + "step": 551, + "think_completion_length": 86.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.296875, + "epoch": 0.9308600337268128, + "grad_norm": 13.530986743553575, + "kl": 0.3818359375, + "learning_rate": 8.138279932546373e-07, + "loss": 0.0004, + "reward": 3.046547293663025, + "reward_std": 0.38355183601379395, + "rewards/final_reward": 0.5386104314584983, + "rewards/mask_iou_reward": 0.26930521572924915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0465472042560577, + "rewards/thk_ans_format_reward": 1.0, + "step": 552, + "think_completion_length": 81.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.84375, + "epoch": 0.9325463743676222, + "grad_norm": 7.018348869059321, + "kl": 0.37890625, + "learning_rate": 8.134907251264755e-07, + "loss": 0.0004, + "reward": 3.382186532020569, + "reward_std": 0.1798281967639923, + "rewards/final_reward": 1.0075253248878073, + "rewards/mask_iou_reward": 0.5037626624439037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3821865320205688, + "rewards/thk_ans_format_reward": 1.0, + "step": 553, + "think_completion_length": 78.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.578125, + "epoch": 0.9342327150084317, + "grad_norm": 6.620793871261661, + "kl": 0.3818359375, + "learning_rate": 8.131534569983136e-07, + "loss": 0.0004, + "reward": 3.33063006401062, + "reward_std": 0.20287129282951355, + "rewards/final_reward": 0.9096661811312425, + "rewards/mask_iou_reward": 0.45483309056562127, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3306299448013306, + "rewards/thk_ans_format_reward": 1.0, + "step": 554, + "think_completion_length": 76.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.890625, + "epoch": 0.9359190556492412, + "grad_norm": 5.218423197482733, + "kl": 0.35546875, + "learning_rate": 8.128161888701517e-07, + "loss": 0.0004, + "reward": 2.9394739866256714, + "reward_std": 0.4641287475824356, + "rewards/final_reward": 0.8743346962894474, + "rewards/mask_iou_reward": 0.4371673481447237, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9394739866256714, + "rewards/thk_ans_format_reward": 1.0, + "step": 555, + "think_completion_length": 84.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.984375, + "epoch": 0.9376053962900506, + "grad_norm": 18.845803308886303, + "kl": 0.3994140625, + "learning_rate": 8.124789207419899e-07, + "loss": 0.0004, + "reward": 3.325919985771179, + "reward_std": 0.30405908077955246, + "rewards/final_reward": 1.2264077496832324, + "rewards/mask_iou_reward": 0.6132038748416162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.325919896364212, + "rewards/thk_ans_format_reward": 1.0, + "step": 556, + "think_completion_length": 78.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.328125, + "epoch": 0.93929173693086, + "grad_norm": 5.499154937677363, + "kl": 0.34765625, + "learning_rate": 8.12141652613828e-07, + "loss": 0.0003, + "reward": 2.9827044010162354, + "reward_std": 0.350845642387867, + "rewards/final_reward": 0.6183570090078176, + "rewards/mask_iou_reward": 0.3091785045039088, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0139543414115906, + "rewards/thk_ans_format_reward": 0.984375, + "step": 557, + "think_completion_length": 76.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.890625, + "epoch": 0.9409780775716695, + "grad_norm": 8.221926187139507, + "kl": 0.3671875, + "learning_rate": 8.118043844856661e-07, + "loss": 0.0004, + "reward": 3.4161949157714844, + "reward_std": 0.24237601598724723, + "rewards/final_reward": 1.3035412984268884, + "rewards/mask_iou_reward": 0.6517706492134442, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4161949753761292, + "rewards/thk_ans_format_reward": 1.0, + "step": 558, + "think_completion_length": 77.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.078125, + "epoch": 0.9426644182124789, + "grad_norm": 4.669917534704876, + "kl": 0.3662109375, + "learning_rate": 8.114671163575043e-07, + "loss": 0.0004, + "reward": 2.6135934591293335, + "reward_std": 0.38687272369861603, + "rewards/final_reward": 0.2619348523224978, + "rewards/mask_iou_reward": 0.1309674261612489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6135933995246887, + "rewards/thk_ans_format_reward": 1.0, + "step": 559, + "think_completion_length": 83.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.140625, + "epoch": 0.9443507588532883, + "grad_norm": 5.903891117155913, + "kl": 0.35546875, + "learning_rate": 8.111298482293422e-07, + "loss": 0.0004, + "reward": 3.100181221961975, + "reward_std": 0.18918309919536114, + "rewards/final_reward": 1.320369290484187, + "rewards/mask_iou_reward": 0.6601846452420935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1001812517642975, + "rewards/thk_ans_format_reward": 1.0, + "step": 560, + "think_completion_length": 88.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.8125, + "epoch": 0.9460370994940978, + "grad_norm": 5.2513254819440265, + "kl": 0.40625, + "learning_rate": 8.107925801011804e-07, + "loss": 0.0004, + "reward": 3.007655143737793, + "reward_std": 0.1493750810623169, + "rewards/final_reward": 1.0473003270407384, + "rewards/mask_iou_reward": 0.5236501635203692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0076551735401154, + "rewards/thk_ans_format_reward": 1.0, + "step": 561, + "think_completion_length": 80.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.5, + "epoch": 0.9477234401349073, + "grad_norm": 6.63392999354413, + "kl": 0.357421875, + "learning_rate": 8.104553119730185e-07, + "loss": 0.0004, + "reward": 3.241023063659668, + "reward_std": 0.3681875765323639, + "rewards/final_reward": 1.285415503132128, + "rewards/mask_iou_reward": 0.642707751566064, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2566478848457336, + "rewards/thk_ans_format_reward": 1.0, + "step": 562, + "think_completion_length": 86.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.90625, + "epoch": 0.9494097807757167, + "grad_norm": 5.375862707812532, + "kl": 0.3525390625, + "learning_rate": 8.101180438448566e-07, + "loss": 0.0004, + "reward": 2.9660009145736694, + "reward_std": 0.22842106223106384, + "rewards/final_reward": 0.3887532703580729, + "rewards/mask_iou_reward": 0.19437663517903644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9660007953643799, + "rewards/thk_ans_format_reward": 1.0, + "step": 563, + "think_completion_length": 77.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3125, + "epoch": 0.9510961214165261, + "grad_norm": 5.006303669029434, + "kl": 0.685546875, + "learning_rate": 8.097807757166948e-07, + "loss": 0.0007, + "reward": 2.836309552192688, + "reward_std": 0.33161526918411255, + "rewards/final_reward": 0.3801896632615437, + "rewards/mask_iou_reward": 0.19009483163077184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8363094627857208, + "rewards/thk_ans_format_reward": 1.0, + "step": 564, + "think_completion_length": 80.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.359375, + "epoch": 0.9527824620573356, + "grad_norm": 6.971235012770987, + "kl": 0.369140625, + "learning_rate": 8.094435075885329e-07, + "loss": 0.0004, + "reward": 3.0500903129577637, + "reward_std": 0.21272655948996544, + "rewards/final_reward": 0.5225397620726526, + "rewards/mask_iou_reward": 0.2612698810363263, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0500901937484741, + "rewards/thk_ans_format_reward": 1.0, + "step": 565, + "think_completion_length": 71.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.4375, + "epoch": 0.954468802698145, + "grad_norm": 4.573759651160915, + "kl": 0.375, + "learning_rate": 8.09106239460371e-07, + "loss": 0.0004, + "reward": 2.946573495864868, + "reward_std": 0.24175241217017174, + "rewards/final_reward": 1.4133046233131688, + "rewards/mask_iou_reward": 0.7066523116565844, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9465733766555786, + "rewards/thk_ans_format_reward": 1.0, + "step": 566, + "think_completion_length": 83.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.59375, + "epoch": 0.9561551433389545, + "grad_norm": 7.187812157503615, + "kl": 0.3544921875, + "learning_rate": 8.087689713322092e-07, + "loss": 0.0004, + "reward": 3.17309832572937, + "reward_std": 0.38770583271980286, + "rewards/final_reward": 1.101495376355855, + "rewards/mask_iou_reward": 0.5507476881779275, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1730982661247253, + "rewards/thk_ans_format_reward": 1.0, + "step": 567, + "think_completion_length": 70.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.390625, + "epoch": 0.9578414839797639, + "grad_norm": 7.228857272066258, + "kl": 0.3603515625, + "learning_rate": 8.084317032040471e-07, + "loss": 0.0004, + "reward": 3.3016059398651123, + "reward_std": 0.2917497009038925, + "rewards/final_reward": 1.6633718802454966, + "rewards/mask_iou_reward": 0.8316859401227483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3016058802604675, + "rewards/thk_ans_format_reward": 1.0, + "step": 568, + "think_completion_length": 74.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.96875, + "epoch": 0.9595278246205734, + "grad_norm": 5.464540876657938, + "kl": 0.3779296875, + "learning_rate": 8.080944350758852e-07, + "loss": 0.0004, + "reward": 2.973666191101074, + "reward_std": 0.5053753107786179, + "rewards/final_reward": 1.1511587511836887, + "rewards/mask_iou_reward": 0.5755793755918444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9736661314964294, + "rewards/thk_ans_format_reward": 1.0, + "step": 569, + "think_completion_length": 82.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.984375, + "epoch": 0.9612141652613828, + "grad_norm": 7.821684462869653, + "kl": 0.4404296875, + "learning_rate": 8.077571669477234e-07, + "loss": 0.0004, + "reward": 3.028488874435425, + "reward_std": 0.34643781185150146, + "rewards/final_reward": 1.6624882837630195, + "rewards/mask_iou_reward": 0.8312441418815097, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0284889936447144, + "rewards/thk_ans_format_reward": 1.0, + "step": 570, + "think_completion_length": 67.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.71875, + "epoch": 0.9629005059021922, + "grad_norm": 7.693091615022995, + "kl": 0.375, + "learning_rate": 8.074198988195615e-07, + "loss": 0.0004, + "reward": 2.7286545038223267, + "reward_std": 0.30753113329410553, + "rewards/final_reward": 1.0370801433569967, + "rewards/mask_iou_reward": 0.5185400716784984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7286544442176819, + "rewards/thk_ans_format_reward": 1.0, + "step": 571, + "think_completion_length": 66.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.78125, + "epoch": 0.9645868465430016, + "grad_norm": 9.500808036106628, + "kl": 0.384765625, + "learning_rate": 8.070826306913996e-07, + "loss": 0.0004, + "reward": 2.7350605726242065, + "reward_std": 0.19486035406589508, + "rewards/final_reward": 0.5022729043104245, + "rewards/mask_iou_reward": 0.25113645215521224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7350606322288513, + "rewards/thk_ans_format_reward": 1.0, + "step": 572, + "think_completion_length": 67.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.625, + "epoch": 0.9662731871838112, + "grad_norm": 14.562869249290125, + "kl": 0.380859375, + "learning_rate": 8.067453625632378e-07, + "loss": 0.0004, + "reward": 3.1157146692276, + "reward_std": 0.21578150242567062, + "rewards/final_reward": 1.424389242246907, + "rewards/mask_iou_reward": 0.7121946211234536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1313397586345673, + "rewards/thk_ans_format_reward": 0.984375, + "step": 573, + "think_completion_length": 69.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.921875, + "epoch": 0.9679595278246206, + "grad_norm": 4.906234454731228, + "kl": 0.396484375, + "learning_rate": 8.064080944350759e-07, + "loss": 0.0004, + "reward": 2.532925605773926, + "reward_std": 0.20672988891601562, + "rewards/final_reward": 0.9013923587631087, + "rewards/mask_iou_reward": 0.6365832389192185, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5329254902899265, + "rewards/thk_ans_format_reward": 1.0, + "step": 574, + "think_completion_length": 66.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.46875, + "epoch": 0.96964586846543, + "grad_norm": 11.348233863362653, + "kl": 0.453125, + "learning_rate": 8.06070826306914e-07, + "loss": 0.0005, + "reward": 2.809985041618347, + "reward_std": 0.24615808576345444, + "rewards/final_reward": 1.39664637338225, + "rewards/mask_iou_reward": 0.698323186691125, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8256100416183472, + "rewards/thk_ans_format_reward": 1.0, + "step": 575, + "think_completion_length": 69.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.09375, + "epoch": 0.9713322091062394, + "grad_norm": 5.294065865305967, + "kl": 0.3916015625, + "learning_rate": 8.057335581787521e-07, + "loss": 0.0004, + "reward": 3.430790901184082, + "reward_std": 0.043369969353079796, + "rewards/final_reward": 1.1329873898201583, + "rewards/mask_iou_reward": 0.5664936949100792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.430790901184082, + "rewards/thk_ans_format_reward": 1.0, + "step": 576, + "think_completion_length": 66.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.59375, + "epoch": 0.9730185497470489, + "grad_norm": 8.854015009866645, + "kl": 0.375, + "learning_rate": 8.053962900505901e-07, + "loss": 0.0004, + "reward": 3.2189637422561646, + "reward_std": 0.43901485204696655, + "rewards/final_reward": 1.0151315489185706, + "rewards/mask_iou_reward": 0.5075657744592853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2189638018608093, + "rewards/thk_ans_format_reward": 1.0, + "step": 577, + "think_completion_length": 68.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.546875, + "epoch": 0.9747048903878583, + "grad_norm": 4.261687082002958, + "kl": 0.392578125, + "learning_rate": 8.050590219224282e-07, + "loss": 0.0004, + "reward": 3.3709945678710938, + "reward_std": 0.14033617079257965, + "rewards/final_reward": 1.3439573249012957, + "rewards/mask_iou_reward": 0.6719786624506479, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3709946870803833, + "rewards/thk_ans_format_reward": 1.0, + "step": 578, + "think_completion_length": 66.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.109375, + "epoch": 0.9763912310286678, + "grad_norm": 4.447207228653514, + "kl": 0.388671875, + "learning_rate": 8.047217537942664e-07, + "loss": 0.0004, + "reward": 3.2281277179718018, + "reward_std": 0.14848940074443817, + "rewards/final_reward": 0.8529733127401289, + "rewards/mask_iou_reward": 0.42648665637006444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2281277775764465, + "rewards/thk_ans_format_reward": 1.0, + "step": 579, + "think_completion_length": 68.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.671875, + "epoch": 0.9780775716694773, + "grad_norm": 4.781897329353487, + "kl": 0.392578125, + "learning_rate": 8.043844856661045e-07, + "loss": 0.0004, + "reward": 3.032002568244934, + "reward_std": 0.18674946948885918, + "rewards/final_reward": 1.2685883990484217, + "rewards/mask_iou_reward": 0.6342941995242108, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0320026278495789, + "rewards/thk_ans_format_reward": 1.0, + "step": 580, + "think_completion_length": 62.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.578125, + "epoch": 0.9797639123102867, + "grad_norm": 5.322720152197539, + "kl": 0.3583984375, + "learning_rate": 8.040472175379426e-07, + "loss": 0.0004, + "reward": 3.2712482213974, + "reward_std": 0.49656783044338226, + "rewards/final_reward": 1.682521514658056, + "rewards/mask_iou_reward": 0.841260757329028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2712482213974, + "rewards/thk_ans_format_reward": 1.0, + "step": 581, + "think_completion_length": 67.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.59375, + "epoch": 0.9814502529510961, + "grad_norm": 12.090539903297808, + "kl": 0.38671875, + "learning_rate": 8.037099494097808e-07, + "loss": 0.0004, + "reward": 3.437628746032715, + "reward_std": 0.28335001319646835, + "rewards/final_reward": 1.5924873047351613, + "rewards/mask_iou_reward": 0.7962436523675807, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4532538652420044, + "rewards/thk_ans_format_reward": 1.0, + "step": 582, + "think_completion_length": 70.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.765625, + "epoch": 0.9831365935919055, + "grad_norm": 9.381927842657715, + "kl": 0.35546875, + "learning_rate": 8.033726812816189e-07, + "loss": 0.0004, + "reward": 3.397608757019043, + "reward_std": 0.22774043679237366, + "rewards/final_reward": 1.1310353021720032, + "rewards/mask_iou_reward": 0.5655176510860016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3976088762283325, + "rewards/thk_ans_format_reward": 1.0, + "step": 583, + "think_completion_length": 65.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.34375, + "epoch": 0.984822934232715, + "grad_norm": 5.605649395831915, + "kl": 0.38671875, + "learning_rate": 8.030354131534569e-07, + "loss": 0.0004, + "reward": 3.2333011627197266, + "reward_std": 0.145121393725276, + "rewards/final_reward": 1.088090215710324, + "rewards/mask_iou_reward": 0.544045107855162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2489261627197266, + "rewards/thk_ans_format_reward": 0.984375, + "step": 584, + "think_completion_length": 64.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0625, + "epoch": 0.9865092748735245, + "grad_norm": 8.106226266357831, + "kl": 0.3798828125, + "learning_rate": 8.02698145025295e-07, + "loss": 0.0004, + "reward": 3.3262200355529785, + "reward_std": 0.04663046449422836, + "rewards/final_reward": 1.5821852953472084, + "rewards/mask_iou_reward": 0.7910926476736042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3262198567390442, + "rewards/thk_ans_format_reward": 1.0, + "step": 585, + "think_completion_length": 68.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.375, + "epoch": 0.9881956155143339, + "grad_norm": 12.60392616979475, + "kl": 0.482421875, + "learning_rate": 8.023608768971331e-07, + "loss": 0.0005, + "reward": 3.0508854389190674, + "reward_std": 0.16749375313520432, + "rewards/final_reward": 0.5494016521565961, + "rewards/mask_iou_reward": 0.27470082607829804, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.050885260105133, + "rewards/thk_ans_format_reward": 1.0, + "step": 586, + "think_completion_length": 60.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0, + "epoch": 0.9898819561551433, + "grad_norm": 4.6943230545014885, + "kl": 0.4091796875, + "learning_rate": 8.020236087689713e-07, + "loss": 0.0004, + "reward": 3.140958547592163, + "reward_std": 0.11254860181361437, + "rewards/final_reward": 1.7400074032806145, + "rewards/mask_iou_reward": 0.8700037016403073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1409586668014526, + "rewards/thk_ans_format_reward": 1.0, + "step": 587, + "think_completion_length": 60.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6875, + "epoch": 0.9915682967959528, + "grad_norm": 5.5450739698049, + "kl": 0.3984375, + "learning_rate": 8.016863406408094e-07, + "loss": 0.0004, + "reward": 2.947129249572754, + "reward_std": 0.23405615240335464, + "rewards/final_reward": 1.3299888408969076, + "rewards/mask_iou_reward": 0.6649944204484538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9471292495727539, + "rewards/thk_ans_format_reward": 1.0, + "step": 588, + "think_completion_length": 68.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.046875, + "epoch": 0.9932546374367622, + "grad_norm": 5.584845044253345, + "kl": 0.4248046875, + "learning_rate": 8.013490725126475e-07, + "loss": 0.0004, + "reward": 2.913410186767578, + "reward_std": 0.18985669524408877, + "rewards/final_reward": 0.5123443019232624, + "rewards/mask_iou_reward": 0.2561721509616312, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9134101867675781, + "rewards/thk_ans_format_reward": 1.0, + "step": 589, + "think_completion_length": 64.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.65625, + "epoch": 0.9949409780775716, + "grad_norm": 12.922202701045435, + "kl": 0.435546875, + "learning_rate": 8.010118043844857e-07, + "loss": 0.0004, + "reward": 2.93517804145813, + "reward_std": 0.2607208490371704, + "rewards/final_reward": 0.5130544114157121, + "rewards/mask_iou_reward": 0.25652720570785603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9351779818534851, + "rewards/thk_ans_format_reward": 1.0, + "step": 590, + "think_completion_length": 69.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.171875, + "epoch": 0.9966273187183811, + "grad_norm": 4.585499227421089, + "kl": 0.5078125, + "learning_rate": 8.006745362563238e-07, + "loss": 0.0005, + "reward": 3.003074288368225, + "reward_std": 0.21296508610248566, + "rewards/final_reward": 1.2166292128694447, + "rewards/mask_iou_reward": 0.6083146064347223, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.003074288368225, + "rewards/thk_ans_format_reward": 1.0, + "step": 591, + "think_completion_length": 64.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.50000762939453, + "epoch": 0.9983136593591906, + "grad_norm": 4.028094910756302, + "kl": 0.453125, + "learning_rate": 8.003372681281619e-07, + "loss": 0.0005, + "reward": 2.4843724966049194, + "reward_std": 0.13876148965209723, + "rewards/final_reward": 0.5210704072103204, + "rewards/mask_iou_reward": 0.2605352036051602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.48437248170375824, + "rewards/thk_ans_format_reward": 1.0, + "step": 592, + "think_completion_length": 61.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.203125, + "epoch": 1.0016863406408094, + "grad_norm": 6.134430951701791, + "kl": 0.37890625, + "learning_rate": 8e-07, + "loss": 0.0004, + "reward": 3.165682554244995, + "reward_std": 0.2925257980823517, + "rewards/final_reward": 0.8199532475510345, + "rewards/mask_iou_reward": 0.40997662377551725, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1813074350357056, + "rewards/thk_ans_format_reward": 1.0, + "step": 593, + "think_completion_length": 58.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.171875, + "epoch": 1.0033726812816188, + "grad_norm": 5.720605963148865, + "kl": 0.376953125, + "learning_rate": 7.99662731871838e-07, + "loss": 0.0004, + "reward": 2.8060446977615356, + "reward_std": 0.08986183628439903, + "rewards/final_reward": 0.6783671900048, + "rewards/mask_iou_reward": 0.3391835950024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.806044727563858, + "rewards/thk_ans_format_reward": 1.0, + "step": 594, + "think_completion_length": 69.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.59375, + "epoch": 1.0050590219224282, + "grad_norm": 5.326609065448382, + "kl": 0.4091796875, + "learning_rate": 7.993254637436761e-07, + "loss": 0.0004, + "reward": 3.6479886770248413, + "reward_std": 0.04161073174327612, + "rewards/final_reward": 1.7347050746666608, + "rewards/mask_iou_reward": 0.8673525373333304, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6479886770248413, + "rewards/thk_ans_format_reward": 1.0, + "step": 595, + "think_completion_length": 58.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.40625, + "epoch": 1.0067453625632379, + "grad_norm": 4.209252478789583, + "kl": 0.49609375, + "learning_rate": 7.989881956155143e-07, + "loss": 0.0005, + "reward": 3.5398935079574585, + "reward_std": 0.20502007007598877, + "rewards/final_reward": 1.4636818424143263, + "rewards/mask_iou_reward": 0.7318409212071632, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.539893388748169, + "rewards/thk_ans_format_reward": 1.0, + "step": 596, + "think_completion_length": 62.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.296875, + "epoch": 1.0084317032040473, + "grad_norm": 3.5878805024775158, + "kl": 0.359375, + "learning_rate": 7.986509274873524e-07, + "loss": 0.0004, + "reward": 3.013667583465576, + "reward_std": 0.2156200110912323, + "rewards/final_reward": 0.5815063242492908, + "rewards/mask_iou_reward": 0.2907531621246454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0136675834655762, + "rewards/thk_ans_format_reward": 1.0, + "step": 597, + "think_completion_length": 66.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.03125, + "epoch": 1.0101180438448567, + "grad_norm": 10.09366079770441, + "kl": 0.400390625, + "learning_rate": 7.983136593591905e-07, + "loss": 0.0004, + "reward": 3.603295087814331, + "reward_std": 0.24045547097921371, + "rewards/final_reward": 1.5487568914734149, + "rewards/mask_iou_reward": 0.7743784457367074, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.603295087814331, + "rewards/thk_ans_format_reward": 1.0, + "step": 598, + "think_completion_length": 67.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.390625, + "epoch": 1.0118043844856661, + "grad_norm": 27.303257749207962, + "kl": 0.3798828125, + "learning_rate": 7.979763912310287e-07, + "loss": 0.0004, + "reward": 3.058853507041931, + "reward_std": 0.2468905746936798, + "rewards/final_reward": 0.7482793830128771, + "rewards/mask_iou_reward": 0.37413969150643855, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.058853566646576, + "rewards/thk_ans_format_reward": 1.0, + "step": 599, + "think_completion_length": 63.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.140625, + "epoch": 1.0134907251264755, + "grad_norm": 8.621046672564987, + "kl": 0.388671875, + "learning_rate": 7.976391231028668e-07, + "loss": 0.0004, + "reward": 2.5445148944854736, + "reward_std": 0.23673780262470245, + "rewards/final_reward": 0.2954952263858441, + "rewards/mask_iou_reward": 0.14774761319292204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5445149838924408, + "rewards/thk_ans_format_reward": 1.0, + "step": 600, + "think_completion_length": 62.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6875, + "epoch": 1.015177065767285, + "grad_norm": 11.444735510612778, + "kl": 0.365234375, + "learning_rate": 7.973018549747048e-07, + "loss": 0.0004, + "reward": 3.351117491722107, + "reward_std": 0.16388334333896637, + "rewards/final_reward": 1.430642644523778, + "rewards/mask_iou_reward": 0.715321322261889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.351117491722107, + "rewards/thk_ans_format_reward": 1.0, + "step": 601, + "think_completion_length": 60.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.546875, + "epoch": 1.0168634064080944, + "grad_norm": 16.50112903301435, + "kl": 0.3955078125, + "learning_rate": 7.96964586846543e-07, + "loss": 0.0004, + "reward": 3.4486085176467896, + "reward_std": 0.07140736281871796, + "rewards/final_reward": 1.781127874725175, + "rewards/mask_iou_reward": 0.8905639373625875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4486083388328552, + "rewards/thk_ans_format_reward": 1.0, + "step": 602, + "think_completion_length": 59.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.640625, + "epoch": 1.0185497470489038, + "grad_norm": 3.386400756178599, + "kl": 0.41015625, + "learning_rate": 7.96627318718381e-07, + "loss": 0.0004, + "reward": 3.1761107444763184, + "reward_std": 0.19732992816716433, + "rewards/final_reward": 0.6813215000785604, + "rewards/mask_iou_reward": 0.3406607500392802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.176110714673996, + "rewards/thk_ans_format_reward": 1.0, + "step": 603, + "think_completion_length": 58.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.984375, + "epoch": 1.0202360876897134, + "grad_norm": 6.050376141836877, + "kl": 0.40625, + "learning_rate": 7.962900505902191e-07, + "loss": 0.0004, + "reward": 3.3012163639068604, + "reward_std": 0.2960309898480773, + "rewards/final_reward": 1.299787516927388, + "rewards/mask_iou_reward": 0.649893758463694, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3012162744998932, + "rewards/thk_ans_format_reward": 1.0, + "step": 604, + "think_completion_length": 59.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.40625, + "epoch": 1.0219224283305228, + "grad_norm": 8.792295259990127, + "kl": 0.439453125, + "learning_rate": 7.959527824620573e-07, + "loss": 0.0004, + "reward": 3.051405191421509, + "reward_std": 0.2848479002714157, + "rewards/final_reward": 1.513040004001537, + "rewards/mask_iou_reward": 0.7565200020007685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0514051914215088, + "rewards/thk_ans_format_reward": 1.0, + "step": 605, + "think_completion_length": 54.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.34375, + "epoch": 1.0236087689713322, + "grad_norm": 9.924707938769188, + "kl": 1.1298828125, + "learning_rate": 7.956155143338954e-07, + "loss": 0.0011, + "reward": 3.635921359062195, + "reward_std": 0.23812589421868324, + "rewards/final_reward": 1.5734499780355082, + "rewards/mask_iou_reward": 0.7867249890177541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.63592129945755, + "rewards/thk_ans_format_reward": 1.0, + "step": 606, + "think_completion_length": 63.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.34375, + "epoch": 1.0252951096121417, + "grad_norm": 5.650344995506986, + "kl": 0.400390625, + "learning_rate": 7.952782462057335e-07, + "loss": 0.0004, + "reward": 2.9694302082061768, + "reward_std": 0.4676559269428253, + "rewards/final_reward": 1.1528093681017233, + "rewards/mask_iou_reward": 0.5764046840508616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9694302976131439, + "rewards/thk_ans_format_reward": 1.0, + "step": 607, + "think_completion_length": 58.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.375, + "epoch": 1.026981450252951, + "grad_norm": 43.31806523897635, + "kl": 0.40234375, + "learning_rate": 7.949409780775717e-07, + "loss": 0.0005, + "reward": 3.5301413536071777, + "reward_std": 0.11195811629295349, + "rewards/final_reward": 1.2349649014730204, + "rewards/mask_iou_reward": 0.6174824507365102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5301413536071777, + "rewards/thk_ans_format_reward": 1.0, + "step": 608, + "think_completion_length": 60.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.921875, + "epoch": 1.0286677908937605, + "grad_norm": 4.963350314889085, + "kl": 0.365234375, + "learning_rate": 7.946037099494097e-07, + "loss": 0.0004, + "reward": 3.425765633583069, + "reward_std": 0.1056961864233017, + "rewards/final_reward": 1.2456830820121185, + "rewards/mask_iou_reward": 0.6228415410060593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4257656931877136, + "rewards/thk_ans_format_reward": 1.0, + "step": 609, + "think_completion_length": 64.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8125, + "epoch": 1.03035413153457, + "grad_norm": 6.993123062729114, + "kl": 0.39453125, + "learning_rate": 7.942664418212478e-07, + "loss": 0.0004, + "reward": 2.9795360565185547, + "reward_std": 0.16955439560115337, + "rewards/final_reward": 0.9026098214107283, + "rewards/mask_iou_reward": 0.45130491070536416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9795358777046204, + "rewards/thk_ans_format_reward": 1.0, + "step": 610, + "think_completion_length": 69.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.453125, + "epoch": 1.0320404721753795, + "grad_norm": 5.435356630552201, + "kl": 0.4072265625, + "learning_rate": 7.93929173693086e-07, + "loss": 0.0004, + "reward": 3.634620428085327, + "reward_std": 0.12156452983617783, + "rewards/final_reward": 1.7145997847390988, + "rewards/mask_iou_reward": 0.8572998923695494, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6346204280853271, + "rewards/thk_ans_format_reward": 1.0, + "step": 611, + "think_completion_length": 62.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.046875, + "epoch": 1.033726812816189, + "grad_norm": 5.7166593319996695, + "kl": 0.376953125, + "learning_rate": 7.93591905564924e-07, + "loss": 0.0004, + "reward": 3.13163959980011, + "reward_std": 0.40263403952121735, + "rewards/final_reward": 1.020172167459069, + "rewards/mask_iou_reward": 0.5100860837295345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1316396296024323, + "rewards/thk_ans_format_reward": 1.0, + "step": 612, + "think_completion_length": 64.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.921875, + "epoch": 1.0354131534569984, + "grad_norm": 8.067962248791533, + "kl": 0.3857421875, + "learning_rate": 7.932546374367622e-07, + "loss": 0.0004, + "reward": 3.3966641426086426, + "reward_std": 0.056119462475180626, + "rewards/final_reward": 1.7494821178904116, + "rewards/mask_iou_reward": 0.8747410589452058, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3966641426086426, + "rewards/thk_ans_format_reward": 1.0, + "step": 613, + "think_completion_length": 59.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.34375, + "epoch": 1.0370994940978078, + "grad_norm": 7.981542131433182, + "kl": 0.42578125, + "learning_rate": 7.929173693086003e-07, + "loss": 0.0004, + "reward": 3.305042862892151, + "reward_std": 0.09254418313503265, + "rewards/final_reward": 1.3203188567393185, + "rewards/mask_iou_reward": 0.6601594283696592, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3050429224967957, + "rewards/thk_ans_format_reward": 1.0, + "step": 614, + "think_completion_length": 58.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.359375, + "epoch": 1.0387858347386172, + "grad_norm": 30.331721779804514, + "kl": 0.37890625, + "learning_rate": 7.925801011804384e-07, + "loss": 0.0005, + "reward": 3.3478095531463623, + "reward_std": 0.12014642171561718, + "rewards/final_reward": 1.7833511407636402, + "rewards/mask_iou_reward": 0.8916755703818201, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3478095531463623, + "rewards/thk_ans_format_reward": 1.0, + "step": 615, + "think_completion_length": 65.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.453125, + "epoch": 1.0404721753794266, + "grad_norm": 7.313904503197525, + "kl": 0.3759765625, + "learning_rate": 7.922428330522766e-07, + "loss": 0.0004, + "reward": 3.3215017318725586, + "reward_std": 0.3130381852388382, + "rewards/final_reward": 0.9915974087722843, + "rewards/mask_iou_reward": 0.49579870438614215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3215016722679138, + "rewards/thk_ans_format_reward": 1.0, + "step": 616, + "think_completion_length": 62.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.671875, + "epoch": 1.042158516020236, + "grad_norm": 3.2835782882643687, + "kl": 0.3955078125, + "learning_rate": 7.919055649241147e-07, + "loss": 0.0004, + "reward": 3.607258439064026, + "reward_std": 0.09734362363815308, + "rewards/final_reward": 1.5118354327932084, + "rewards/mask_iou_reward": 0.7559177163966042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6072584390640259, + "rewards/thk_ans_format_reward": 1.0, + "step": 617, + "think_completion_length": 53.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.453125, + "epoch": 1.0438448566610454, + "grad_norm": 12.442324301753523, + "kl": 0.4423828125, + "learning_rate": 7.915682967959527e-07, + "loss": 0.0004, + "reward": 3.0813547372817993, + "reward_std": 0.25647109746932983, + "rewards/final_reward": 0.8053913152295082, + "rewards/mask_iou_reward": 0.4026956576147541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0813546776771545, + "rewards/thk_ans_format_reward": 1.0, + "step": 618, + "think_completion_length": 55.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.21875, + "epoch": 1.045531197301855, + "grad_norm": 4.388943716983332, + "kl": 0.44140625, + "learning_rate": 7.912310286677909e-07, + "loss": 0.0004, + "reward": 3.1991811990737915, + "reward_std": 0.06475630914792418, + "rewards/final_reward": 1.6401812516112466, + "rewards/mask_iou_reward": 0.8200906258056233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1991811692714691, + "rewards/thk_ans_format_reward": 1.0, + "step": 619, + "think_completion_length": 55.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.640625, + "epoch": 1.0472175379426645, + "grad_norm": 6.518516562698951, + "kl": 0.4638671875, + "learning_rate": 7.90893760539629e-07, + "loss": 0.0005, + "reward": 3.4357359409332275, + "reward_std": 0.19499383866786957, + "rewards/final_reward": 0.9589320952628577, + "rewards/mask_iou_reward": 0.47946604763142886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4357359409332275, + "rewards/thk_ans_format_reward": 1.0, + "step": 620, + "think_completion_length": 59.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.78125, + "epoch": 1.048903878583474, + "grad_norm": 3.5479767787749883, + "kl": 0.37109375, + "learning_rate": 7.90556492411467e-07, + "loss": 0.0004, + "reward": 3.5422295331954956, + "reward_std": 0.20139742642641068, + "rewards/final_reward": 1.6928670840316493, + "rewards/mask_iou_reward": 0.8464335420158247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.54222971200943, + "rewards/thk_ans_format_reward": 1.0, + "step": 621, + "think_completion_length": 60.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.171875, + "epoch": 1.0505902192242833, + "grad_norm": 33.272162033127636, + "kl": 0.3916015625, + "learning_rate": 7.902192242833052e-07, + "loss": 0.0004, + "reward": 2.83156681060791, + "reward_std": 0.21725196577608585, + "rewards/final_reward": 0.8005620578474413, + "rewards/mask_iou_reward": 0.40028102892372064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8315666913986206, + "rewards/thk_ans_format_reward": 1.0, + "step": 622, + "think_completion_length": 60.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.578125, + "epoch": 1.0522765598650927, + "grad_norm": 8.5697470890475, + "kl": 0.505859375, + "learning_rate": 7.898819561551433e-07, + "loss": 0.0005, + "reward": 3.144253969192505, + "reward_std": 0.17730970680713654, + "rewards/final_reward": 1.4364145312940275, + "rewards/mask_iou_reward": 0.7182072656470138, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.14425390958786, + "rewards/thk_ans_format_reward": 1.0, + "step": 623, + "think_completion_length": 56.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.84375, + "epoch": 1.0539629005059021, + "grad_norm": 13.854537021856972, + "kl": 0.501953125, + "learning_rate": 7.895446880269814e-07, + "loss": 0.0005, + "reward": 2.9286797046661377, + "reward_std": 0.2590184882283211, + "rewards/final_reward": 0.7867915929348831, + "rewards/mask_iou_reward": 0.39339579646744155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9286799728870392, + "rewards/thk_ans_format_reward": 1.0, + "step": 624, + "think_completion_length": 54.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.34375, + "epoch": 1.0556492411467115, + "grad_norm": 8.146735804112318, + "kl": 0.38671875, + "learning_rate": 7.892074198988196e-07, + "loss": 0.0004, + "reward": 3.0838944911956787, + "reward_std": 0.1183428168296814, + "rewards/final_reward": 1.2904961257823027, + "rewards/mask_iou_reward": 0.6452480628911513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0838944911956787, + "rewards/thk_ans_format_reward": 1.0, + "step": 625, + "think_completion_length": 55.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.234375, + "epoch": 1.0573355817875212, + "grad_norm": 20.97851972202258, + "kl": 0.3642578125, + "learning_rate": 7.888701517706576e-07, + "loss": 0.0004, + "reward": 3.509265661239624, + "reward_std": 0.08183963038027287, + "rewards/final_reward": 1.1006403752958818, + "rewards/mask_iou_reward": 0.5503201876479409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.509265661239624, + "rewards/thk_ans_format_reward": 1.0, + "step": 626, + "think_completion_length": 57.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 1.0590219224283306, + "grad_norm": 7.9849582432385855, + "kl": 0.4345703125, + "learning_rate": 7.885328836424957e-07, + "loss": 0.0004, + "reward": 3.200868010520935, + "reward_std": 0.2000262811779976, + "rewards/final_reward": 1.1624054937062296, + "rewards/mask_iou_reward": 0.5812027468531148, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.216493010520935, + "rewards/thk_ans_format_reward": 0.984375, + "step": 627, + "think_completion_length": 60.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.328125, + "epoch": 1.06070826306914, + "grad_norm": 8.670323596103566, + "kl": 0.392578125, + "learning_rate": 7.881956155143339e-07, + "loss": 0.0004, + "reward": 3.0339386463165283, + "reward_std": 0.07193895429372787, + "rewards/final_reward": 0.7449650734008929, + "rewards/mask_iou_reward": 0.37248253670044645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0339386463165283, + "rewards/thk_ans_format_reward": 1.0, + "step": 628, + "think_completion_length": 53.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.671875, + "epoch": 1.0623946037099494, + "grad_norm": 6.489276755856646, + "kl": 0.41015625, + "learning_rate": 7.87858347386172e-07, + "loss": 0.0004, + "reward": 3.4266610145568848, + "reward_std": 0.335693646222353, + "rewards/final_reward": 1.6146912092898873, + "rewards/mask_iou_reward": 0.8073456046449436, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.42666095495224, + "rewards/thk_ans_format_reward": 1.0, + "step": 629, + "think_completion_length": 54.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.71875, + "epoch": 1.0640809443507588, + "grad_norm": 16.97609842909034, + "kl": 0.404296875, + "learning_rate": 7.8752107925801e-07, + "loss": 0.0004, + "reward": 3.035442352294922, + "reward_std": 0.39801885560154915, + "rewards/final_reward": 1.6349356163559836, + "rewards/mask_iou_reward": 0.8174678081779918, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.0979422330856323, + "rewards/thk_ans_format_reward": 0.96875, + "step": 630, + "think_completion_length": 52.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.578125, + "epoch": 1.0657672849915683, + "grad_norm": 15.181511428300269, + "kl": 0.580078125, + "learning_rate": 7.871838111298482e-07, + "loss": 0.0006, + "reward": 3.3145382404327393, + "reward_std": 0.1567831113934517, + "rewards/final_reward": 0.7586362291775279, + "rewards/mask_iou_reward": 0.37931811458876397, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3145381212234497, + "rewards/thk_ans_format_reward": 1.0, + "step": 631, + "think_completion_length": 51.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.453125, + "epoch": 1.0674536256323777, + "grad_norm": 9.106384318124793, + "kl": 0.380859375, + "learning_rate": 7.868465430016863e-07, + "loss": 0.0004, + "reward": 3.314599871635437, + "reward_std": 0.1469927802681923, + "rewards/final_reward": 1.1458556851733492, + "rewards/mask_iou_reward": 0.5729278425866746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3145997822284698, + "rewards/thk_ans_format_reward": 1.0, + "step": 632, + "think_completion_length": 54.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.375, + "epoch": 1.069139966273187, + "grad_norm": 7.79880255821729, + "kl": 0.9228515625, + "learning_rate": 7.865092748735244e-07, + "loss": 0.0009, + "reward": 3.261892080307007, + "reward_std": 0.11794530600309372, + "rewards/final_reward": 1.251106499385277, + "rewards/mask_iou_reward": 0.6255532496926385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2618920803070068, + "rewards/thk_ans_format_reward": 1.0, + "step": 633, + "think_completion_length": 51.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.234375, + "epoch": 1.0708263069139967, + "grad_norm": 4.452227059679726, + "kl": 0.48828125, + "learning_rate": 7.861720067453625e-07, + "loss": 0.0005, + "reward": 3.0262283086776733, + "reward_std": 0.09251783415675163, + "rewards/final_reward": 1.2590802333169506, + "rewards/mask_iou_reward": 0.6295401166584753, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0262282192707062, + "rewards/thk_ans_format_reward": 1.0, + "step": 634, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0625, + "epoch": 1.0725126475548061, + "grad_norm": 39.93271843995892, + "kl": 0.552734375, + "learning_rate": 7.858347386172006e-07, + "loss": 0.0006, + "reward": 3.4057939052581787, + "reward_std": 0.12188607268035412, + "rewards/final_reward": 1.7272153940200017, + "rewards/mask_iou_reward": 0.8636076970100008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4057939052581787, + "rewards/thk_ans_format_reward": 1.0, + "step": 635, + "think_completion_length": 50.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.78125, + "epoch": 1.0741989881956155, + "grad_norm": 39.57929332449854, + "kl": 0.6142578125, + "learning_rate": 7.854974704890388e-07, + "loss": 0.0006, + "reward": 3.00014066696167, + "reward_std": 0.4901411384344101, + "rewards/final_reward": 1.0535258352398766, + "rewards/mask_iou_reward": 0.5267629176199383, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0157656520605087, + "rewards/thk_ans_format_reward": 1.0, + "step": 636, + "think_completion_length": 50.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.453125, + "epoch": 1.075885328836425, + "grad_norm": 3.306093363331697, + "kl": 0.384765625, + "learning_rate": 7.851602023608769e-07, + "loss": 0.0004, + "reward": 2.886072278022766, + "reward_std": 0.10245025204494596, + "rewards/final_reward": 1.2185875798915173, + "rewards/mask_iou_reward": 0.6092937899457587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8860722482204437, + "rewards/thk_ans_format_reward": 1.0, + "step": 637, + "think_completion_length": 49.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.359375, + "epoch": 1.0775716694772344, + "grad_norm": 279.3506833741868, + "kl": 0.427734375, + "learning_rate": 7.84822934232715e-07, + "loss": 0.0004, + "reward": 3.6662213802337646, + "reward_std": 0.19164992403239012, + "rewards/final_reward": 1.7271686355598388, + "rewards/mask_iou_reward": 0.8635843177799194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6662213802337646, + "rewards/thk_ans_format_reward": 1.0, + "step": 638, + "think_completion_length": 45.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.65625, + "epoch": 1.0792580101180438, + "grad_norm": 6.750675311209432, + "kl": 0.408203125, + "learning_rate": 7.844856661045531e-07, + "loss": 0.0004, + "reward": 3.471511483192444, + "reward_std": 0.27441119961440563, + "rewards/final_reward": 1.466517628389615, + "rewards/mask_iou_reward": 0.7332588141948075, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4871366024017334, + "rewards/thk_ans_format_reward": 1.0, + "step": 639, + "think_completion_length": 50.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.53125, + "epoch": 1.0809443507588532, + "grad_norm": 21.646709986864188, + "kl": 0.4453125, + "learning_rate": 7.841483979763912e-07, + "loss": 0.0004, + "reward": 2.989328145980835, + "reward_std": 0.09891379065811634, + "rewards/final_reward": 1.0614732894732377, + "rewards/mask_iou_reward": 0.5307366447366189, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9893282353878021, + "rewards/thk_ans_format_reward": 1.0, + "step": 640, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.421875, + "epoch": 1.0826306913996628, + "grad_norm": 5.5497035358072635, + "kl": 0.451171875, + "learning_rate": 7.838111298482293e-07, + "loss": 0.0005, + "reward": 3.1036102771759033, + "reward_std": 0.06228892970830202, + "rewards/final_reward": 0.4459223647664925, + "rewards/mask_iou_reward": 0.22296118238324625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1036102771759033, + "rewards/thk_ans_format_reward": 1.0, + "step": 641, + "think_completion_length": 45.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.8125, + "epoch": 1.0843170320404723, + "grad_norm": 61.0294153640609, + "kl": 0.431640625, + "learning_rate": 7.834738617200675e-07, + "loss": 0.0004, + "reward": 2.8523221015930176, + "reward_std": 0.3196341544389725, + "rewards/final_reward": 0.7710843417183383, + "rewards/mask_iou_reward": 0.38554217085916914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8523220717906952, + "rewards/thk_ans_format_reward": 1.0, + "step": 642, + "think_completion_length": 46.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.6875, + "epoch": 1.0860033726812817, + "grad_norm": 35.921443486527984, + "kl": 0.4814453125, + "learning_rate": 7.831365935919055e-07, + "loss": 0.0005, + "reward": 3.1496444940567017, + "reward_std": 0.19438892230391502, + "rewards/final_reward": 1.4102102276977724, + "rewards/mask_iou_reward": 0.7051051138488862, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.149644523859024, + "rewards/thk_ans_format_reward": 1.0, + "step": 643, + "think_completion_length": 44.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.984375, + "epoch": 1.087689713322091, + "grad_norm": 4.382258185474288, + "kl": 0.44921875, + "learning_rate": 7.827993254637436e-07, + "loss": 0.0004, + "reward": 3.082374095916748, + "reward_std": 0.3743949681520462, + "rewards/final_reward": 1.017262522169081, + "rewards/mask_iou_reward": 0.5086312610845405, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.082374095916748, + "rewards/thk_ans_format_reward": 1.0, + "step": 644, + "think_completion_length": 45.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.515625, + "epoch": 1.0893760539629005, + "grad_norm": 5.795620025146997, + "kl": 0.458984375, + "learning_rate": 7.824620573355818e-07, + "loss": 0.0005, + "reward": 3.644650101661682, + "reward_std": 0.0579620311036706, + "rewards/final_reward": 1.3550629451891312, + "rewards/mask_iou_reward": 0.6775314725945656, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6446500420570374, + "rewards/thk_ans_format_reward": 1.0, + "step": 645, + "think_completion_length": 48.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.71875, + "epoch": 1.09106239460371, + "grad_norm": 5.570465496238775, + "kl": 0.41796875, + "learning_rate": 7.821247892074199e-07, + "loss": 0.0004, + "reward": 3.243638515472412, + "reward_std": 0.08751763962209225, + "rewards/final_reward": 0.7646918304668557, + "rewards/mask_iou_reward": 0.38234591523342787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.243638515472412, + "rewards/thk_ans_format_reward": 1.0, + "step": 646, + "think_completion_length": 44.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.203125, + "epoch": 1.0927487352445193, + "grad_norm": 6.600286707253367, + "kl": 0.4091796875, + "learning_rate": 7.81787521079258e-07, + "loss": 0.0004, + "reward": 3.062265157699585, + "reward_std": 0.1764051355421543, + "rewards/final_reward": 0.7824303531778339, + "rewards/mask_iou_reward": 0.39121517658891697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0622652620077133, + "rewards/thk_ans_format_reward": 1.0, + "step": 647, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.21875, + "epoch": 1.0944350758853287, + "grad_norm": 4.030924908993303, + "kl": 0.6171875, + "learning_rate": 7.814502529510961e-07, + "loss": 0.0006, + "reward": 3.7860748767852783, + "reward_std": 0.16600636392831802, + "rewards/final_reward": 1.9111873242786737, + "rewards/mask_iou_reward": 0.9555936621393368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7860747575759888, + "rewards/thk_ans_format_reward": 1.0, + "step": 648, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.15625, + "epoch": 1.0961214165261384, + "grad_norm": 4.2376043097515055, + "kl": 0.7412109375, + "learning_rate": 7.811129848229342e-07, + "loss": 0.0007, + "reward": 3.2226240634918213, + "reward_std": 0.3179095759987831, + "rewards/final_reward": 0.7279156450622776, + "rewards/mask_iou_reward": 0.3639578225311388, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2382490634918213, + "rewards/thk_ans_format_reward": 1.0, + "step": 649, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.4375, + "epoch": 1.0978077571669478, + "grad_norm": 5.523908512555914, + "kl": 0.4794921875, + "learning_rate": 7.807757166947723e-07, + "loss": 0.0005, + "reward": 3.006617307662964, + "reward_std": 0.14593719691038132, + "rewards/final_reward": 1.5455323930270022, + "rewards/mask_iou_reward": 0.7727661965135011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0066173672676086, + "rewards/thk_ans_format_reward": 1.0, + "step": 650, + "think_completion_length": 43.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 1.0994940978077572, + "grad_norm": 4.917758171979661, + "kl": 0.4541015625, + "learning_rate": 7.804384485666104e-07, + "loss": 0.0005, + "reward": 2.8413726091384888, + "reward_std": 0.298868240788579, + "rewards/final_reward": 1.5816240118600156, + "rewards/mask_iou_reward": 0.7908120059300078, + "rewards/sam_format_reward": 0.890625, + "rewards/sam_reward_func_ultra": 0.9507474899291992, + "rewards/thk_ans_format_reward": 1.0, + "step": 651, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.4375, + "epoch": 1.1011804384485666, + "grad_norm": 8.512332945482656, + "kl": 0.4638671875, + "learning_rate": 7.801011804384485e-07, + "loss": 0.0005, + "reward": 2.937049388885498, + "reward_std": 0.3492274060845375, + "rewards/final_reward": 1.1078227273727301, + "rewards/mask_iou_reward": 0.5539113636863651, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.9682995975017548, + "rewards/thk_ans_format_reward": 1.0, + "step": 652, + "think_completion_length": 48.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.34375, + "epoch": 1.102866779089376, + "grad_norm": 5.278917106886176, + "kl": 0.4736328125, + "learning_rate": 7.797639123102866e-07, + "loss": 0.0005, + "reward": 3.4667773246765137, + "reward_std": 0.2953631021082401, + "rewards/final_reward": 1.650441733990713, + "rewards/mask_iou_reward": 0.8252208669953565, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4667773246765137, + "rewards/thk_ans_format_reward": 1.0, + "step": 653, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.734375, + "epoch": 1.1045531197301854, + "grad_norm": 12.133570183221005, + "kl": 0.4599609375, + "learning_rate": 7.794266441821248e-07, + "loss": 0.0005, + "reward": 3.3131359815597534, + "reward_std": 0.141848836094141, + "rewards/final_reward": 1.1049517799093656, + "rewards/mask_iou_reward": 0.5524758899546828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3131359219551086, + "rewards/thk_ans_format_reward": 1.0, + "step": 654, + "think_completion_length": 45.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 1.1062394603709949, + "grad_norm": 6.928059153280524, + "kl": 0.455078125, + "learning_rate": 7.790893760539629e-07, + "loss": 0.0005, + "reward": 3.330763339996338, + "reward_std": 0.46064063906669617, + "rewards/final_reward": 1.2615539547932235, + "rewards/mask_iou_reward": 0.6307769773966118, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3620134592056274, + "rewards/thk_ans_format_reward": 1.0, + "step": 655, + "think_completion_length": 41.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 1.1079258010118043, + "grad_norm": 4.186907452762862, + "kl": 0.4306640625, + "learning_rate": 7.78752107925801e-07, + "loss": 0.0004, + "reward": 3.546625852584839, + "reward_std": 0.20074513833969831, + "rewards/final_reward": 1.3358930150040598, + "rewards/mask_iou_reward": 0.6679465075020299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5466259121894836, + "rewards/thk_ans_format_reward": 1.0, + "step": 656, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.28125, + "epoch": 1.109612141652614, + "grad_norm": 6.109888489679801, + "kl": 0.421875, + "learning_rate": 7.784148397976391e-07, + "loss": 0.0004, + "reward": 3.2946518659591675, + "reward_std": 0.34683462977409363, + "rewards/final_reward": 0.8698238201682837, + "rewards/mask_iou_reward": 0.4349119100841419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2946519255638123, + "rewards/thk_ans_format_reward": 1.0, + "step": 657, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.671875, + "epoch": 1.1112984822934233, + "grad_norm": 4.5710675246879156, + "kl": 0.48046875, + "learning_rate": 7.780775716694772e-07, + "loss": 0.0005, + "reward": 3.563894748687744, + "reward_std": 0.2252323478460312, + "rewards/final_reward": 1.6749597198477488, + "rewards/mask_iou_reward": 0.8374798599238744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5638948678970337, + "rewards/thk_ans_format_reward": 1.0, + "step": 658, + "think_completion_length": 46.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.5, + "epoch": 1.1129848229342327, + "grad_norm": 10.866443084152671, + "kl": 0.4345703125, + "learning_rate": 7.777403035413152e-07, + "loss": 0.0004, + "reward": 2.81168270111084, + "reward_std": 0.17690950445830822, + "rewards/final_reward": 0.8872991301042769, + "rewards/mask_iou_reward": 0.44364956505213843, + "rewards/sam_format_reward": 0.796875, + "rewards/sam_reward_func_ultra": 1.0148076713085175, + "rewards/thk_ans_format_reward": 1.0, + "step": 659, + "think_completion_length": 44.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.71875, + "epoch": 1.1146711635750421, + "grad_norm": 22.99738676784577, + "kl": 0.4794921875, + "learning_rate": 7.774030354131534e-07, + "loss": 0.0005, + "reward": 3.2107110023498535, + "reward_std": 0.24932213127613068, + "rewards/final_reward": 1.4678053004170692, + "rewards/mask_iou_reward": 0.7339026502085346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2107109427452087, + "rewards/thk_ans_format_reward": 1.0, + "step": 660, + "think_completion_length": 41.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.15625, + "epoch": 1.1163575042158516, + "grad_norm": 61.78637021385139, + "kl": 0.4072265625, + "learning_rate": 7.770657672849915e-07, + "loss": 0.0005, + "reward": 3.1127153635025024, + "reward_std": 0.21405567973852158, + "rewards/final_reward": 1.4737091512600995, + "rewards/mask_iou_reward": 0.7368545756300497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.11271533370018, + "rewards/thk_ans_format_reward": 1.0, + "step": 661, + "think_completion_length": 40.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 1.118043844856661, + "grad_norm": 6.807626678475442, + "kl": 0.525390625, + "learning_rate": 7.767284991568297e-07, + "loss": 0.0005, + "reward": 3.3367409706115723, + "reward_std": 0.23176462203264236, + "rewards/final_reward": 1.0719849956783123, + "rewards/mask_iou_reward": 0.5359924978391561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3367410898208618, + "rewards/thk_ans_format_reward": 1.0, + "step": 662, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.84375, + "epoch": 1.1197301854974704, + "grad_norm": 10.173401210446599, + "kl": 0.455078125, + "learning_rate": 7.763912310286678e-07, + "loss": 0.0005, + "reward": 2.986254930496216, + "reward_std": 0.34364715963602066, + "rewards/final_reward": 1.1272437953036465, + "rewards/mask_iou_reward": 0.5636218976518232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9862548112869263, + "rewards/thk_ans_format_reward": 1.0, + "step": 663, + "think_completion_length": 42.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.265625, + "epoch": 1.12141652613828, + "grad_norm": 8.303520314295719, + "kl": 0.5380859375, + "learning_rate": 7.760539629005059e-07, + "loss": 0.0005, + "reward": 2.907498359680176, + "reward_std": 0.31269126385450363, + "rewards/final_reward": 0.7283450222985134, + "rewards/mask_iou_reward": 0.3641725111492567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9074984192848206, + "rewards/thk_ans_format_reward": 1.0, + "step": 664, + "think_completion_length": 44.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1875, + "epoch": 1.1231028667790894, + "grad_norm": 6.347832378595188, + "kl": 0.421875, + "learning_rate": 7.757166947723441e-07, + "loss": 0.0004, + "reward": 2.9704482555389404, + "reward_std": 0.13693349808454514, + "rewards/final_reward": 0.6053038205227258, + "rewards/mask_iou_reward": 0.3026519102613629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9704483151435852, + "rewards/thk_ans_format_reward": 1.0, + "step": 665, + "think_completion_length": 41.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.828125, + "epoch": 1.1247892074198989, + "grad_norm": 8.262589575211235, + "kl": 0.423828125, + "learning_rate": 7.753794266441821e-07, + "loss": 0.0004, + "reward": 3.4435518980026245, + "reward_std": 0.18118244968354702, + "rewards/final_reward": 1.1758351428381058, + "rewards/mask_iou_reward": 0.5879175714190529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4435516595840454, + "rewards/thk_ans_format_reward": 1.0, + "step": 666, + "think_completion_length": 44.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 1.1264755480607083, + "grad_norm": 11.73547579152421, + "kl": 0.4765625, + "learning_rate": 7.750421585160201e-07, + "loss": 0.0005, + "reward": 3.694632411003113, + "reward_std": 0.11483868956565857, + "rewards/final_reward": 1.6840263089165557, + "rewards/mask_iou_reward": 0.8420131544582778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6946325302124023, + "rewards/thk_ans_format_reward": 1.0, + "step": 667, + "think_completion_length": 45.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.546875, + "epoch": 1.1281618887015177, + "grad_norm": 6.678149708156542, + "kl": 0.568359375, + "learning_rate": 7.747048903878583e-07, + "loss": 0.0006, + "reward": 2.7610191106796265, + "reward_std": 0.4594908654689789, + "rewards/final_reward": 0.8904040467556769, + "rewards/mask_iou_reward": 0.44520202337783843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7610192000865936, + "rewards/thk_ans_format_reward": 1.0, + "step": 668, + "think_completion_length": 39.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.96875, + "epoch": 1.129848229342327, + "grad_norm": 6.113441041476647, + "kl": 0.419921875, + "learning_rate": 7.743676222596964e-07, + "loss": 0.0004, + "reward": 3.385975956916809, + "reward_std": 0.154005765914917, + "rewards/final_reward": 1.206318374353144, + "rewards/mask_iou_reward": 0.603159187176572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3859760165214539, + "rewards/thk_ans_format_reward": 1.0, + "step": 669, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.109375, + "epoch": 1.1315345699831365, + "grad_norm": 4.674232401056913, + "kl": 0.4599609375, + "learning_rate": 7.740303541315345e-07, + "loss": 0.0004, + "reward": 3.6678245067596436, + "reward_std": 0.02954744128510356, + "rewards/final_reward": 1.8396449375646342, + "rewards/mask_iou_reward": 0.9198224687823171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.667824625968933, + "rewards/thk_ans_format_reward": 1.0, + "step": 670, + "think_completion_length": 41.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.265625, + "epoch": 1.1332209106239461, + "grad_norm": 9.631130618354605, + "kl": 0.462890625, + "learning_rate": 7.736930860033727e-07, + "loss": 0.0005, + "reward": 3.1673879623413086, + "reward_std": 0.046774497255682945, + "rewards/final_reward": 1.6174178301439586, + "rewards/mask_iou_reward": 0.8087089150719793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.167387992143631, + "rewards/thk_ans_format_reward": 1.0, + "step": 671, + "think_completion_length": 41.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.96875, + "epoch": 1.1349072512647556, + "grad_norm": 9.80047585453591, + "kl": 0.42578125, + "learning_rate": 7.733558178752108e-07, + "loss": 0.0004, + "reward": 3.186527371406555, + "reward_std": 0.2629920244216919, + "rewards/final_reward": 1.4596871984128676, + "rewards/mask_iou_reward": 0.7298435992064338, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1865273118019104, + "rewards/thk_ans_format_reward": 1.0, + "step": 672, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.25, + "epoch": 1.136593591905565, + "grad_norm": 8.232232103136324, + "kl": 0.421875, + "learning_rate": 7.730185497470489e-07, + "loss": 0.0004, + "reward": 3.802139163017273, + "reward_std": 0.16265618288889527, + "rewards/final_reward": 1.928601635210926, + "rewards/mask_iou_reward": 0.964300817605463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.802139163017273, + "rewards/thk_ans_format_reward": 1.0, + "step": 673, + "think_completion_length": 44.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.90625, + "epoch": 1.1382799325463744, + "grad_norm": 5.270149298222591, + "kl": 0.45703125, + "learning_rate": 7.726812816188871e-07, + "loss": 0.0005, + "reward": 3.2774369716644287, + "reward_std": 0.19858654215931892, + "rewards/final_reward": 1.1144515948091382, + "rewards/mask_iou_reward": 0.5572257974045691, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2774368524551392, + "rewards/thk_ans_format_reward": 1.0, + "step": 674, + "think_completion_length": 33.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.375, + "epoch": 1.1399662731871838, + "grad_norm": 6.837655439090573, + "kl": 1.76171875, + "learning_rate": 7.72344013490725e-07, + "loss": 0.0018, + "reward": 3.363555073738098, + "reward_std": 0.28620167076587677, + "rewards/final_reward": 1.5152913358286324, + "rewards/mask_iou_reward": 0.7576456679143162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3635550737380981, + "rewards/thk_ans_format_reward": 1.0, + "step": 675, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.203125, + "epoch": 1.1416526138279932, + "grad_norm": 6.200267618177697, + "kl": 0.41796875, + "learning_rate": 7.720067453625631e-07, + "loss": 0.0004, + "reward": 3.6870086193084717, + "reward_std": 0.17093387246131897, + "rewards/final_reward": 1.5394327535416457, + "rewards/mask_iou_reward": 0.7697163767708228, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.702633798122406, + "rewards/thk_ans_format_reward": 0.984375, + "step": 676, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.09375, + "epoch": 1.1433389544688026, + "grad_norm": 8.51211024924678, + "kl": 0.44140625, + "learning_rate": 7.716694772344013e-07, + "loss": 0.0004, + "reward": 2.8949872255325317, + "reward_std": 0.41623103618621826, + "rewards/final_reward": 0.2714590533213712, + "rewards/mask_iou_reward": 0.1357295266606856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8949873745441437, + "rewards/thk_ans_format_reward": 1.0, + "step": 677, + "think_completion_length": 43.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.21875, + "epoch": 1.1450252951096123, + "grad_norm": 8.563116450543093, + "kl": 0.4755859375, + "learning_rate": 7.713322091062394e-07, + "loss": 0.0005, + "reward": 2.552576780319214, + "reward_std": 0.3992340862751007, + "rewards/final_reward": 0.848194710225629, + "rewards/mask_iou_reward": 0.4240973551128145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5525768399238586, + "rewards/thk_ans_format_reward": 1.0, + "step": 678, + "think_completion_length": 44.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.703125, + "epoch": 1.1467116357504217, + "grad_norm": 6.127334987076423, + "kl": 0.4169921875, + "learning_rate": 7.709949409780775e-07, + "loss": 0.0004, + "reward": 3.4885432720184326, + "reward_std": 0.19754197634756565, + "rewards/final_reward": 1.5432886049163599, + "rewards/mask_iou_reward": 0.7716443024581799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4885433316230774, + "rewards/thk_ans_format_reward": 1.0, + "step": 679, + "think_completion_length": 40.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.390625, + "epoch": 1.148397976391231, + "grad_norm": 4.875342612314185, + "kl": 0.419921875, + "learning_rate": 7.706576728499157e-07, + "loss": 0.0004, + "reward": 2.6762442588806152, + "reward_std": 0.27623605728149414, + "rewards/final_reward": 1.3332219909973353, + "rewards/mask_iou_reward": 0.6666109954986676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6762442886829376, + "rewards/thk_ans_format_reward": 1.0, + "step": 680, + "think_completion_length": 46.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.375, + "epoch": 1.1500843170320405, + "grad_norm": 25.542862127950208, + "kl": 0.3955078125, + "learning_rate": 7.703204047217538e-07, + "loss": 0.0004, + "reward": 3.241006016731262, + "reward_std": 0.1869177557528019, + "rewards/final_reward": 1.3966232754602865, + "rewards/mask_iou_reward": 0.6983116377301433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2410059869289398, + "rewards/thk_ans_format_reward": 1.0, + "step": 681, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 1.15177065767285, + "grad_norm": 6.6579015132829715, + "kl": 0.4658203125, + "learning_rate": 7.699831365935919e-07, + "loss": 0.0005, + "reward": 3.1924837827682495, + "reward_std": 0.29617342352867126, + "rewards/final_reward": 1.2179031696155878, + "rewards/mask_iou_reward": 0.6089515848077939, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.192483812570572, + "rewards/thk_ans_format_reward": 1.0, + "step": 682, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.984375, + "epoch": 1.1534569983136593, + "grad_norm": 15.655594911719893, + "kl": 0.44921875, + "learning_rate": 7.6964586846543e-07, + "loss": 0.0005, + "reward": 3.237351417541504, + "reward_std": 0.21372727304697037, + "rewards/final_reward": 1.0931966809390936, + "rewards/mask_iou_reward": 0.5465983404695468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2373515665531158, + "rewards/thk_ans_format_reward": 1.0, + "step": 683, + "think_completion_length": 42.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.90625, + "epoch": 1.1551433389544687, + "grad_norm": 5.186245702340746, + "kl": 0.4375, + "learning_rate": 7.69308600337268e-07, + "loss": 0.0004, + "reward": 3.1966097354888916, + "reward_std": 0.27127550914883614, + "rewards/final_reward": 1.0101069758644736, + "rewards/mask_iou_reward": 0.5050534879322368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1966098546981812, + "rewards/thk_ans_format_reward": 1.0, + "step": 684, + "think_completion_length": 39.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.390625, + "epoch": 1.1568296795952782, + "grad_norm": 10.1943751403639, + "kl": 0.37109375, + "learning_rate": 7.689713322091061e-07, + "loss": 0.0003, + "reward": 2.9875484704971313, + "reward_std": 0.1596406251192093, + "rewards/final_reward": 1.1998612642817665, + "rewards/mask_iou_reward": 0.5999306321408833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9875486344099045, + "rewards/thk_ans_format_reward": 1.0, + "step": 685, + "think_completion_length": 44.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.3125, + "epoch": 1.1585160202360876, + "grad_norm": 12.939471681884397, + "kl": 0.4775390625, + "learning_rate": 7.686340640809443e-07, + "loss": 0.0005, + "reward": 3.0023581981658936, + "reward_std": 0.2409796817228198, + "rewards/final_reward": 1.081150993998099, + "rewards/mask_iou_reward": 0.5405754969990495, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0023581981658936, + "rewards/thk_ans_format_reward": 1.0, + "step": 686, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.640625, + "epoch": 1.1602023608768972, + "grad_norm": 5.825517597219945, + "kl": 0.4091796875, + "learning_rate": 7.682967959527824e-07, + "loss": 0.0004, + "reward": 2.737739324569702, + "reward_std": 0.36891575902700424, + "rewards/final_reward": 0.8585461742314954, + "rewards/mask_iou_reward": 0.4292730871157477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7377393543720245, + "rewards/thk_ans_format_reward": 1.0, + "step": 687, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.390625, + "epoch": 1.1618887015177066, + "grad_norm": 3.7055147438675196, + "kl": 0.392578125, + "learning_rate": 7.679595278246206e-07, + "loss": 0.0004, + "reward": 3.136894106864929, + "reward_std": 0.44828712940216064, + "rewards/final_reward": 1.4373869457406165, + "rewards/mask_iou_reward": 0.7186934728703083, + "rewards/sam_format_reward": 0.921875, + "rewards/sam_reward_func_ultra": 1.2150189876556396, + "rewards/thk_ans_format_reward": 1.0, + "step": 688, + "think_completion_length": 43.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.625, + "epoch": 1.163575042158516, + "grad_norm": 10.711703911868902, + "kl": 0.3662109375, + "learning_rate": 7.676222596964587e-07, + "loss": 0.0004, + "reward": 3.549091100692749, + "reward_std": 0.0804364699870348, + "rewards/final_reward": 1.502469671340829, + "rewards/mask_iou_reward": 0.7512348356704145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5490912199020386, + "rewards/thk_ans_format_reward": 1.0, + "step": 689, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.78125, + "epoch": 1.1652613827993255, + "grad_norm": 14.365059467586935, + "kl": 0.505859375, + "learning_rate": 7.672849915682968e-07, + "loss": 0.0005, + "reward": 2.859722137451172, + "reward_std": 0.0706297755241394, + "rewards/final_reward": 1.3300087058547911, + "rewards/mask_iou_reward": 0.6650043529273956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8597220778465271, + "rewards/thk_ans_format_reward": 1.0, + "step": 690, + "think_completion_length": 48.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.59375, + "epoch": 1.1669477234401349, + "grad_norm": 13.83030640010574, + "kl": 0.451171875, + "learning_rate": 7.66947723440135e-07, + "loss": 0.0005, + "reward": 3.6626336574554443, + "reward_std": 0.08809349499642849, + "rewards/final_reward": 1.4519475682557976, + "rewards/mask_iou_reward": 0.7259737841278988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6626335382461548, + "rewards/thk_ans_format_reward": 1.0, + "step": 691, + "think_completion_length": 44.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.234375, + "epoch": 1.1686340640809443, + "grad_norm": 5.7118411670163045, + "kl": 0.451171875, + "learning_rate": 7.666104553119729e-07, + "loss": 0.0005, + "reward": 3.2752153873443604, + "reward_std": 0.22305817902088165, + "rewards/final_reward": 1.0218888606161036, + "rewards/mask_iou_reward": 0.5109444303080518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2752153277397156, + "rewards/thk_ans_format_reward": 1.0, + "step": 692, + "think_completion_length": 42.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.953125, + "epoch": 1.1703204047217537, + "grad_norm": 5.879210219257272, + "kl": 0.4287109375, + "learning_rate": 7.66273187183811e-07, + "loss": 0.0004, + "reward": 3.4141026735305786, + "reward_std": 0.11216456070542336, + "rewards/final_reward": 0.9649098707837429, + "rewards/mask_iou_reward": 0.48245493539187145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4141026735305786, + "rewards/thk_ans_format_reward": 1.0, + "step": 693, + "think_completion_length": 44.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.65625, + "epoch": 1.1720067453625633, + "grad_norm": 4.885554768853665, + "kl": 0.48828125, + "learning_rate": 7.659359190556492e-07, + "loss": 0.0005, + "reward": 3.471671462059021, + "reward_std": 0.023879871238023043, + "rewards/final_reward": 1.0082511677110773, + "rewards/mask_iou_reward": 0.5041255838555386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.471671223640442, + "rewards/thk_ans_format_reward": 1.0, + "step": 694, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.6875, + "epoch": 1.1736930860033727, + "grad_norm": 4.265041635129777, + "kl": 0.458984375, + "learning_rate": 7.655986509274873e-07, + "loss": 0.0005, + "reward": 2.813219904899597, + "reward_std": 0.06556128861848265, + "rewards/final_reward": 0.6052844679476623, + "rewards/mask_iou_reward": 0.30264223397383117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8132199048995972, + "rewards/thk_ans_format_reward": 1.0, + "step": 695, + "think_completion_length": 46.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.140625, + "epoch": 1.1753794266441822, + "grad_norm": 5.2388044987299045, + "kl": 0.3984375, + "learning_rate": 7.652613827993254e-07, + "loss": 0.0004, + "reward": 3.579859733581543, + "reward_std": 0.059681566432118416, + "rewards/final_reward": 1.395962637751189, + "rewards/mask_iou_reward": 0.6979813188755944, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.579859733581543, + "rewards/thk_ans_format_reward": 1.0, + "step": 696, + "think_completion_length": 39.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.078125, + "epoch": 1.1770657672849916, + "grad_norm": 6.088857849607072, + "kl": 0.337890625, + "learning_rate": 7.649241146711636e-07, + "loss": 0.0003, + "reward": 3.479218006134033, + "reward_std": 0.1960524395108223, + "rewards/final_reward": 1.3349460746520103, + "rewards/mask_iou_reward": 0.6674730373260052, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4792180061340332, + "rewards/thk_ans_format_reward": 1.0, + "step": 697, + "think_completion_length": 39.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.15625, + "epoch": 1.178752107925801, + "grad_norm": 6.866854285004716, + "kl": 0.4990234375, + "learning_rate": 7.645868465430017e-07, + "loss": 0.0004, + "reward": 3.4718170166015625, + "reward_std": 0.23394297808408737, + "rewards/final_reward": 1.3271848039220981, + "rewards/mask_iou_reward": 0.6635924019610491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4874420166015625, + "rewards/thk_ans_format_reward": 0.984375, + "step": 698, + "think_completion_length": 43.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.921875, + "epoch": 1.1804384485666104, + "grad_norm": 5.382111118101294, + "kl": 0.453125, + "learning_rate": 7.642495784148398e-07, + "loss": 0.0005, + "reward": 3.5155398845672607, + "reward_std": 0.23116005957126617, + "rewards/final_reward": 1.6273760561564208, + "rewards/mask_iou_reward": 0.8136880280782104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.515539824962616, + "rewards/thk_ans_format_reward": 1.0, + "step": 699, + "think_completion_length": 39.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.453125, + "epoch": 1.1821247892074198, + "grad_norm": 6.276060150190441, + "kl": 0.4521484375, + "learning_rate": 7.639123102866779e-07, + "loss": 0.0005, + "reward": 3.605965256690979, + "reward_std": 0.26602135598659515, + "rewards/final_reward": 1.7934416230025474, + "rewards/mask_iou_reward": 0.8967208115012737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.605965256690979, + "rewards/thk_ans_format_reward": 1.0, + "step": 700, + "think_completion_length": 38.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.421875, + "epoch": 1.1838111298482294, + "grad_norm": 14.606307023698015, + "kl": 0.41796875, + "learning_rate": 7.635750421585159e-07, + "loss": 0.0004, + "reward": 2.9312102794647217, + "reward_std": 0.1353270411491394, + "rewards/final_reward": 0.8034653759214077, + "rewards/mask_iou_reward": 0.4017326879607038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9312102794647217, + "rewards/thk_ans_format_reward": 1.0, + "step": 701, + "think_completion_length": 40.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.078125, + "epoch": 1.1854974704890389, + "grad_norm": 6.197539138196918, + "kl": 0.515625, + "learning_rate": 7.63237774030354e-07, + "loss": 0.0005, + "reward": 3.6228188276290894, + "reward_std": 0.27496435306966305, + "rewards/final_reward": 1.81913042546311, + "rewards/mask_iou_reward": 0.909565212731555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6228187680244446, + "rewards/thk_ans_format_reward": 1.0, + "step": 702, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 1.1871838111298483, + "grad_norm": 9.608806612788296, + "kl": 0.5234375, + "learning_rate": 7.629005059021922e-07, + "loss": 0.0005, + "reward": 3.431188225746155, + "reward_std": 0.208267442882061, + "rewards/final_reward": 1.4498300708540364, + "rewards/mask_iou_reward": 0.7249150354270182, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4311882853507996, + "rewards/thk_ans_format_reward": 1.0, + "step": 703, + "think_completion_length": 39.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 1.1888701517706577, + "grad_norm": 5.504246849404247, + "kl": 0.537109375, + "learning_rate": 7.625632377740303e-07, + "loss": 0.0005, + "reward": 3.801710367202759, + "reward_std": 0.17118039727210999, + "rewards/final_reward": 1.7455660960035957, + "rewards/mask_iou_reward": 0.8727830480017978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8017104268074036, + "rewards/thk_ans_format_reward": 1.0, + "step": 704, + "think_completion_length": 37.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.828125, + "epoch": 1.190556492411467, + "grad_norm": 11.153695855520748, + "kl": 0.525390625, + "learning_rate": 7.622259696458684e-07, + "loss": 0.0005, + "reward": 3.039560914039612, + "reward_std": 0.20383157022297382, + "rewards/final_reward": 1.255676750998822, + "rewards/mask_iou_reward": 0.627838375499411, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0395609438419342, + "rewards/thk_ans_format_reward": 1.0, + "step": 705, + "think_completion_length": 43.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.34375, + "epoch": 1.1922428330522765, + "grad_norm": 11.045631563315313, + "kl": 0.4892578125, + "learning_rate": 7.618887015177066e-07, + "loss": 0.0004, + "reward": 3.4573845863342285, + "reward_std": 0.01934580714441836, + "rewards/final_reward": 1.1347815513852373, + "rewards/mask_iou_reward": 0.5673907756926186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4573845863342285, + "rewards/thk_ans_format_reward": 1.0, + "step": 706, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.53125, + "epoch": 1.193929173693086, + "grad_norm": 6.701662389508587, + "kl": 0.546875, + "learning_rate": 7.615514333895447e-07, + "loss": 0.0005, + "reward": 3.5015335083007812, + "reward_std": 0.1357786562293768, + "rewards/final_reward": 1.0802052054652325, + "rewards/mask_iou_reward": 0.5401026027326162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5015335083007812, + "rewards/thk_ans_format_reward": 1.0, + "step": 707, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.21875, + "epoch": 1.1956155143338956, + "grad_norm": 6.3454562069427025, + "kl": 0.529296875, + "learning_rate": 7.612141652613827e-07, + "loss": 0.0005, + "reward": 3.2759647369384766, + "reward_std": 0.2564007118344307, + "rewards/final_reward": 1.594199117179952, + "rewards/mask_iou_reward": 0.797099558589976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2759648561477661, + "rewards/thk_ans_format_reward": 1.0, + "step": 708, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.96875, + "epoch": 1.197301854974705, + "grad_norm": 7.93254910361403, + "kl": 0.474609375, + "learning_rate": 7.608768971332209e-07, + "loss": 0.0005, + "reward": 3.0473880767822266, + "reward_std": 0.2008904181420803, + "rewards/final_reward": 0.5035804906873941, + "rewards/mask_iou_reward": 0.25179024534369704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0473880767822266, + "rewards/thk_ans_format_reward": 1.0, + "step": 709, + "think_completion_length": 36.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.15625, + "epoch": 1.1989881956155144, + "grad_norm": 24.9492335778285, + "kl": 0.48046875, + "learning_rate": 7.605396290050589e-07, + "loss": 0.0005, + "reward": 3.3758158683776855, + "reward_std": 0.176735520362854, + "rewards/final_reward": 1.6817929384532593, + "rewards/mask_iou_reward": 0.8408964692266296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3758159279823303, + "rewards/thk_ans_format_reward": 1.0, + "step": 710, + "think_completion_length": 39.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.421875, + "epoch": 1.2006745362563238, + "grad_norm": 6.6202653063539225, + "kl": 0.462890625, + "learning_rate": 7.602023608768971e-07, + "loss": 0.0005, + "reward": 3.522263765335083, + "reward_std": 0.15052516479045153, + "rewards/final_reward": 1.788935256001427, + "rewards/mask_iou_reward": 0.8944676280007136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.522263765335083, + "rewards/thk_ans_format_reward": 1.0, + "step": 711, + "think_completion_length": 38.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.171875, + "epoch": 1.2023608768971332, + "grad_norm": 8.148628385247655, + "kl": 0.498046875, + "learning_rate": 7.598650927487352e-07, + "loss": 0.0005, + "reward": 3.5046751499176025, + "reward_std": 0.18944399803876877, + "rewards/final_reward": 1.4865641360598838, + "rewards/mask_iou_reward": 0.7432820680299419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5046750903129578, + "rewards/thk_ans_format_reward": 1.0, + "step": 712, + "think_completion_length": 39.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.8125, + "epoch": 1.2040472175379426, + "grad_norm": 18.178155199698164, + "kl": 0.4921875, + "learning_rate": 7.595278246205733e-07, + "loss": 0.0005, + "reward": 3.5088882446289062, + "reward_std": 0.2718805819749832, + "rewards/final_reward": 1.3515101299801353, + "rewards/mask_iou_reward": 0.6757550649900677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5088882446289062, + "rewards/thk_ans_format_reward": 1.0, + "step": 713, + "think_completion_length": 38.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.453125, + "epoch": 1.205733558178752, + "grad_norm": 4.136701065150466, + "kl": 0.5419921875, + "learning_rate": 7.591905564924115e-07, + "loss": 0.0005, + "reward": 3.2674922943115234, + "reward_std": 0.12187814339995384, + "rewards/final_reward": 0.7277545366620094, + "rewards/mask_iou_reward": 0.3638772683310047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2674922943115234, + "rewards/thk_ans_format_reward": 1.0, + "step": 714, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.40625, + "epoch": 1.2074198988195615, + "grad_norm": 7.655670942990928, + "kl": 0.4765625, + "learning_rate": 7.588532883642496e-07, + "loss": 0.0005, + "reward": 3.6942955255508423, + "reward_std": 0.2473888397216797, + "rewards/final_reward": 1.7120549679054045, + "rewards/mask_iou_reward": 0.8560274839527022, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.709920585155487, + "rewards/thk_ans_format_reward": 1.0, + "step": 715, + "think_completion_length": 40.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.84375, + "epoch": 1.2091062394603709, + "grad_norm": 20.231831063772177, + "kl": 0.482421875, + "learning_rate": 7.585160202360877e-07, + "loss": 0.0005, + "reward": 2.921970248222351, + "reward_std": 0.13596704229712486, + "rewards/final_reward": 0.8988025978332418, + "rewards/mask_iou_reward": 0.4494012989166209, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9219703376293182, + "rewards/thk_ans_format_reward": 1.0, + "step": 716, + "think_completion_length": 39.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.90625, + "epoch": 1.2107925801011805, + "grad_norm": 8.67676356701758, + "kl": 0.46875, + "learning_rate": 7.581787521079258e-07, + "loss": 0.0005, + "reward": 3.2619752883911133, + "reward_std": 0.24443187564611435, + "rewards/final_reward": 1.20501075584318, + "rewards/mask_iou_reward": 0.60250537792159, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2619754672050476, + "rewards/thk_ans_format_reward": 1.0, + "step": 717, + "think_completion_length": 39.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.078125, + "epoch": 1.21247892074199, + "grad_norm": 4.620316797154903, + "kl": 0.4384765625, + "learning_rate": 7.578414839797639e-07, + "loss": 0.0004, + "reward": 3.1769468784332275, + "reward_std": 0.18927378207445145, + "rewards/final_reward": 1.619022070116455, + "rewards/mask_iou_reward": 0.8095110350582275, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1769469380378723, + "rewards/thk_ans_format_reward": 1.0, + "step": 718, + "think_completion_length": 43.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.296875, + "epoch": 1.2141652613827993, + "grad_norm": 4.793946634699995, + "kl": 0.486328125, + "learning_rate": 7.575042158516019e-07, + "loss": 0.0004, + "reward": 3.5579320192337036, + "reward_std": 0.04817063407972455, + "rewards/final_reward": 1.8388994591552934, + "rewards/mask_iou_reward": 0.9194497295776467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5579320192337036, + "rewards/thk_ans_format_reward": 1.0, + "step": 719, + "think_completion_length": 43.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.90625, + "epoch": 1.2158516020236088, + "grad_norm": 6.354021636595106, + "kl": 0.552734375, + "learning_rate": 7.571669477234401e-07, + "loss": 0.0006, + "reward": 3.1674487590789795, + "reward_std": 0.14819223433732986, + "rewards/final_reward": 0.8902573793870587, + "rewards/mask_iou_reward": 0.44512868969352937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1674485504627228, + "rewards/thk_ans_format_reward": 1.0, + "step": 720, + "think_completion_length": 42.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.515625, + "epoch": 1.2175379426644182, + "grad_norm": 10.438214677628338, + "kl": 0.4658203125, + "learning_rate": 7.568296795952782e-07, + "loss": 0.0005, + "reward": 3.6784117221832275, + "reward_std": 0.18426834791898727, + "rewards/final_reward": 1.4776055255912712, + "rewards/mask_iou_reward": 0.7388027627956356, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6784114241600037, + "rewards/thk_ans_format_reward": 1.0, + "step": 721, + "think_completion_length": 48.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.984375, + "epoch": 1.2192242833052276, + "grad_norm": 79.1925794146455, + "kl": 0.4716796875, + "learning_rate": 7.564924114671163e-07, + "loss": 0.0005, + "reward": 3.3132262229919434, + "reward_std": 0.05754976533353329, + "rewards/final_reward": 1.12228992876371, + "rewards/mask_iou_reward": 0.561144964381855, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3132262229919434, + "rewards/thk_ans_format_reward": 1.0, + "step": 722, + "think_completion_length": 46.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.15625, + "epoch": 1.220910623946037, + "grad_norm": 60.04001298477758, + "kl": 0.451171875, + "learning_rate": 7.561551433389545e-07, + "loss": 0.0005, + "reward": 3.064196825027466, + "reward_std": 0.3282191604375839, + "rewards/final_reward": 0.6696442415236867, + "rewards/mask_iou_reward": 0.33482212076184337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.064196765422821, + "rewards/thk_ans_format_reward": 1.0, + "step": 723, + "think_completion_length": 45.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8125, + "epoch": 1.2225969645868466, + "grad_norm": 8.569614982497429, + "kl": 0.4111328125, + "learning_rate": 7.558178752107926e-07, + "loss": 0.0004, + "reward": 3.5133100748062134, + "reward_std": 0.053193164989352226, + "rewards/final_reward": 1.3529733064799343, + "rewards/mask_iou_reward": 0.6764866532399672, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5133100152015686, + "rewards/thk_ans_format_reward": 1.0, + "step": 724, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.953125, + "epoch": 1.224283305227656, + "grad_norm": 9.357989983029983, + "kl": 1.125, + "learning_rate": 7.554806070826306e-07, + "loss": 0.0011, + "reward": 3.160015821456909, + "reward_std": 0.32910278625786304, + "rewards/final_reward": 1.1335708589589917, + "rewards/mask_iou_reward": 0.5667854294794958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.160015881061554, + "rewards/thk_ans_format_reward": 1.0, + "step": 725, + "think_completion_length": 48.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 1.2259696458684655, + "grad_norm": 5.024754635183879, + "kl": 0.478515625, + "learning_rate": 7.551433389544688e-07, + "loss": 0.0005, + "reward": 3.1202811002731323, + "reward_std": 0.18038739264011383, + "rewards/final_reward": 0.9122605295219147, + "rewards/mask_iou_reward": 0.45613026476095736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1202812194824219, + "rewards/thk_ans_format_reward": 1.0, + "step": 726, + "think_completion_length": 45.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 1.2276559865092749, + "grad_norm": 6.988500133578457, + "kl": 0.4287109375, + "learning_rate": 7.548060708263069e-07, + "loss": 0.0004, + "reward": 3.2941445112228394, + "reward_std": 0.3624084070324898, + "rewards/final_reward": 1.4880099850929445, + "rewards/mask_iou_reward": 0.7440049925464722, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2941443920135498, + "rewards/thk_ans_format_reward": 1.0, + "step": 727, + "think_completion_length": 44.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.53125, + "epoch": 1.2293423271500843, + "grad_norm": 4.250688941174817, + "kl": 0.416015625, + "learning_rate": 7.544688026981449e-07, + "loss": 0.0004, + "reward": 3.2780280113220215, + "reward_std": 0.0592149943113327, + "rewards/final_reward": 1.6572678327724175, + "rewards/mask_iou_reward": 0.8286339163862088, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2780280113220215, + "rewards/thk_ans_format_reward": 1.0, + "step": 728, + "think_completion_length": 49.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.875, + "epoch": 1.2310286677908937, + "grad_norm": 10.70609575483574, + "kl": 0.4052734375, + "learning_rate": 7.541315345699831e-07, + "loss": 0.0004, + "reward": 3.305624485015869, + "reward_std": 0.20255491137504578, + "rewards/final_reward": 1.612820795771751, + "rewards/mask_iou_reward": 0.8064103978858755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.305624544620514, + "rewards/thk_ans_format_reward": 1.0, + "step": 729, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.609375, + "epoch": 1.2327150084317031, + "grad_norm": 19.39244591936634, + "kl": 0.4228515625, + "learning_rate": 7.537942664418212e-07, + "loss": 0.0004, + "reward": 3.4895849227905273, + "reward_std": 0.35745085775852203, + "rewards/final_reward": 1.6071061379815976, + "rewards/mask_iou_reward": 0.8035530689907988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4895849823951721, + "rewards/thk_ans_format_reward": 1.0, + "step": 730, + "think_completion_length": 51.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.28125, + "epoch": 1.2344013490725128, + "grad_norm": 5.224728801556268, + "kl": 0.392578125, + "learning_rate": 7.534569983136593e-07, + "loss": 0.0004, + "reward": 3.190197229385376, + "reward_std": 0.1471584215760231, + "rewards/final_reward": 1.115054770568458, + "rewards/mask_iou_reward": 0.557527385284229, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1901973485946655, + "rewards/thk_ans_format_reward": 1.0, + "step": 731, + "think_completion_length": 53.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.140625, + "epoch": 1.2360876897133222, + "grad_norm": 6.368329904579315, + "kl": 0.408203125, + "learning_rate": 7.531197301854975e-07, + "loss": 0.0004, + "reward": 2.9243475198745728, + "reward_std": 0.15122611075639725, + "rewards/final_reward": 0.876438929545167, + "rewards/mask_iou_reward": 0.4382194647725835, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9243474006652832, + "rewards/thk_ans_format_reward": 1.0, + "step": 732, + "think_completion_length": 51.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.359375, + "epoch": 1.2377740303541316, + "grad_norm": 7.636127726903107, + "kl": 0.513671875, + "learning_rate": 7.527824620573355e-07, + "loss": 0.0005, + "reward": 3.237205386161804, + "reward_std": 0.4117434173822403, + "rewards/final_reward": 1.32772648991989, + "rewards/mask_iou_reward": 0.663863244959945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.237205445766449, + "rewards/thk_ans_format_reward": 1.0, + "step": 733, + "think_completion_length": 50.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5625, + "epoch": 1.239460370994941, + "grad_norm": 4.156925317466383, + "kl": 0.423828125, + "learning_rate": 7.524451939291736e-07, + "loss": 0.0004, + "reward": 3.244380831718445, + "reward_std": 0.2315196357667446, + "rewards/final_reward": 1.3360523780132965, + "rewards/mask_iou_reward": 0.6680261890066482, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2443808317184448, + "rewards/thk_ans_format_reward": 1.0, + "step": 734, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.609375, + "epoch": 1.2411467116357504, + "grad_norm": 8.29440186819507, + "kl": 0.416015625, + "learning_rate": 7.521079258010118e-07, + "loss": 0.0004, + "reward": 3.2429388761520386, + "reward_std": 0.285244956612587, + "rewards/final_reward": 1.1256669425154386, + "rewards/mask_iou_reward": 0.5628334712577193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2429389357566833, + "rewards/thk_ans_format_reward": 1.0, + "step": 735, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.15625, + "epoch": 1.2428330522765598, + "grad_norm": 6.871379030140198, + "kl": 0.439453125, + "learning_rate": 7.517706576728498e-07, + "loss": 0.0004, + "reward": 2.851890802383423, + "reward_std": 0.29892025887966156, + "rewards/final_reward": 0.9705862343518611, + "rewards/mask_iou_reward": 0.48529311717593054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8518907129764557, + "rewards/thk_ans_format_reward": 1.0, + "step": 736, + "think_completion_length": 46.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.328125, + "epoch": 1.2445193929173692, + "grad_norm": 5.674713470841899, + "kl": 0.4462890625, + "learning_rate": 7.51433389544688e-07, + "loss": 0.0004, + "reward": 3.2446606159210205, + "reward_std": 0.3162437919527292, + "rewards/final_reward": 1.2440891629731174, + "rewards/mask_iou_reward": 0.6220445814865587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.244660496711731, + "rewards/thk_ans_format_reward": 1.0, + "step": 737, + "think_completion_length": 48.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.296875, + "epoch": 1.2462057335581789, + "grad_norm": 5.272017288869077, + "kl": 0.4306640625, + "learning_rate": 7.510961214165261e-07, + "loss": 0.0004, + "reward": 3.6630584001541138, + "reward_std": 0.09967895410954952, + "rewards/final_reward": 1.5553507476605395, + "rewards/mask_iou_reward": 0.7776753738302697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6630585193634033, + "rewards/thk_ans_format_reward": 1.0, + "step": 738, + "think_completion_length": 48.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.484375, + "epoch": 1.2478920741989883, + "grad_norm": 5.12332748634796, + "kl": 0.42578125, + "learning_rate": 7.507588532883642e-07, + "loss": 0.0004, + "reward": 2.5206953287124634, + "reward_std": 0.4001058042049408, + "rewards/final_reward": 0.29060709585459255, + "rewards/mask_iou_reward": 0.14530354792729627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5206954479217529, + "rewards/thk_ans_format_reward": 1.0, + "step": 739, + "think_completion_length": 53.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.765625, + "epoch": 1.2495784148397977, + "grad_norm": 9.308612277526414, + "kl": 0.5, + "learning_rate": 7.504215851602024e-07, + "loss": 0.0005, + "reward": 3.1267716884613037, + "reward_std": 0.14480041339993477, + "rewards/final_reward": 1.2190345270701473, + "rewards/mask_iou_reward": 0.6095172635350736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1267716884613037, + "rewards/thk_ans_format_reward": 1.0, + "step": 740, + "think_completion_length": 42.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.15625, + "epoch": 1.2512647554806071, + "grad_norm": 5.86388460967559, + "kl": 0.453125, + "learning_rate": 7.500843170320405e-07, + "loss": 0.0005, + "reward": 3.2177951335906982, + "reward_std": 0.13633359596133232, + "rewards/final_reward": 1.0138802716734372, + "rewards/mask_iou_reward": 0.5069401358367186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.217795193195343, + "rewards/thk_ans_format_reward": 1.0, + "step": 741, + "think_completion_length": 48.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.296875, + "epoch": 1.2529510961214165, + "grad_norm": 7.697601454438822, + "kl": 0.46484375, + "learning_rate": 7.497470489038785e-07, + "loss": 0.0005, + "reward": 3.3932933807373047, + "reward_std": 0.16536729037761688, + "rewards/final_reward": 1.2362493299776005, + "rewards/mask_iou_reward": 0.6181246649888003, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3932933807373047, + "rewards/thk_ans_format_reward": 1.0, + "step": 742, + "think_completion_length": 49.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.03125, + "epoch": 1.254637436762226, + "grad_norm": 7.090758479273383, + "kl": 0.3974609375, + "learning_rate": 7.494097807757167e-07, + "loss": 0.0004, + "reward": 3.7598599195480347, + "reward_std": 0.18115262687206268, + "rewards/final_reward": 1.8922707127451575, + "rewards/mask_iou_reward": 0.9461353563725787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7598599791526794, + "rewards/thk_ans_format_reward": 1.0, + "step": 743, + "think_completion_length": 59.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.765625, + "epoch": 1.2563237774030354, + "grad_norm": 11.742361206526029, + "kl": 0.3642578125, + "learning_rate": 7.490725126475548e-07, + "loss": 0.0004, + "reward": 3.0096986293792725, + "reward_std": 0.3475239537656307, + "rewards/final_reward": 0.841675245480936, + "rewards/mask_iou_reward": 0.420837622740468, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.02532359957695, + "rewards/thk_ans_format_reward": 1.0, + "step": 744, + "think_completion_length": 53.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.453125, + "epoch": 1.258010118043845, + "grad_norm": 3.906933246720136, + "kl": 0.4375, + "learning_rate": 7.487352445193928e-07, + "loss": 0.0004, + "reward": 3.715674877166748, + "reward_std": 0.11384453624486923, + "rewards/final_reward": 1.829332156329305, + "rewards/mask_iou_reward": 0.9146660781646525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7156748175621033, + "rewards/thk_ans_format_reward": 1.0, + "step": 745, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.515625, + "epoch": 1.2596964586846542, + "grad_norm": 7.713701865661837, + "kl": 0.400390625, + "learning_rate": 7.48397976391231e-07, + "loss": 0.0004, + "reward": 3.502246141433716, + "reward_std": 0.2445410154759884, + "rewards/final_reward": 1.4493324108565608, + "rewards/mask_iou_reward": 0.7246662054282804, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5022460222244263, + "rewards/thk_ans_format_reward": 1.0, + "step": 746, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5625, + "epoch": 1.2613827993254638, + "grad_norm": 6.49050141193276, + "kl": 0.494140625, + "learning_rate": 7.480607082630691e-07, + "loss": 0.0005, + "reward": 2.7671353816986084, + "reward_std": 0.26506610214710236, + "rewards/final_reward": 0.72433116542667, + "rewards/mask_iou_reward": 0.362165582713335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7671354562044144, + "rewards/thk_ans_format_reward": 1.0, + "step": 747, + "think_completion_length": 51.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.28125, + "epoch": 1.2630691399662732, + "grad_norm": 5.370371100753026, + "kl": 0.466796875, + "learning_rate": 7.477234401349072e-07, + "loss": 0.0005, + "reward": 3.3842209577560425, + "reward_std": 0.12122415006160736, + "rewards/final_reward": 1.1605845540527804, + "rewards/mask_iou_reward": 0.5802922770263902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3842209577560425, + "rewards/thk_ans_format_reward": 1.0, + "step": 748, + "think_completion_length": 46.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.796875, + "epoch": 1.2647554806070826, + "grad_norm": 7.250959167071798, + "kl": 0.416015625, + "learning_rate": 7.473861720067454e-07, + "loss": 0.0004, + "reward": 3.2378557920455933, + "reward_std": 0.13333739154040813, + "rewards/final_reward": 1.5390109431557328, + "rewards/mask_iou_reward": 0.7695054715778664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2378557920455933, + "rewards/thk_ans_format_reward": 1.0, + "step": 749, + "think_completion_length": 48.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.546875, + "epoch": 1.266441821247892, + "grad_norm": 13.773598736341574, + "kl": 0.3857421875, + "learning_rate": 7.470489038785834e-07, + "loss": 0.0004, + "reward": 2.9623697996139526, + "reward_std": 0.670602947473526, + "rewards/final_reward": 0.949467456779882, + "rewards/mask_iou_reward": 0.474733728389941, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.962369829416275, + "rewards/thk_ans_format_reward": 1.0, + "step": 750, + "think_completion_length": 60.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.578125, + "epoch": 1.2681281618887015, + "grad_norm": 43.82863090143971, + "kl": 0.43359375, + "learning_rate": 7.467116357504215e-07, + "loss": 0.0004, + "reward": 2.5886611938476562, + "reward_std": 0.16707976162433624, + "rewards/final_reward": 0.9058522578390267, + "rewards/mask_iou_reward": 0.45292612891951334, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5886611640453339, + "rewards/thk_ans_format_reward": 1.0, + "step": 751, + "think_completion_length": 48.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.96875, + "epoch": 1.269814502529511, + "grad_norm": 8.009493352613628, + "kl": 0.4248046875, + "learning_rate": 7.463743676222597e-07, + "loss": 0.0004, + "reward": 3.5888274908065796, + "reward_std": 0.14439216628670692, + "rewards/final_reward": 1.6300469074796642, + "rewards/mask_iou_reward": 0.8150234537398321, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5888275504112244, + "rewards/thk_ans_format_reward": 1.0, + "step": 752, + "think_completion_length": 55.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.09375, + "epoch": 1.2715008431703203, + "grad_norm": 13.0951374787943, + "kl": 0.41796875, + "learning_rate": 7.460370994940978e-07, + "loss": 0.0004, + "reward": 3.163659930229187, + "reward_std": 0.2516661137342453, + "rewards/final_reward": 0.6976526266310328, + "rewards/mask_iou_reward": 0.3488263133155164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1636599600315094, + "rewards/thk_ans_format_reward": 1.0, + "step": 753, + "think_completion_length": 49.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.796875, + "epoch": 1.27318718381113, + "grad_norm": 7.955705248218445, + "kl": 0.4375, + "learning_rate": 7.456998313659358e-07, + "loss": 0.0004, + "reward": 2.9424456357955933, + "reward_std": 0.24202048778533936, + "rewards/final_reward": 1.233259705619929, + "rewards/mask_iou_reward": 0.6166298528099645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9580706655979156, + "rewards/thk_ans_format_reward": 0.984375, + "step": 754, + "think_completion_length": 46.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.296875, + "epoch": 1.2748735244519394, + "grad_norm": 9.324204820166516, + "kl": 0.6923828125, + "learning_rate": 7.45362563237774e-07, + "loss": 0.0007, + "reward": 3.5302281379699707, + "reward_std": 0.05718242051079869, + "rewards/final_reward": 1.2451604551256392, + "rewards/mask_iou_reward": 0.6225802275628196, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5302280187606812, + "rewards/thk_ans_format_reward": 1.0, + "step": 755, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 1.2765598650927488, + "grad_norm": 5.0297332010064695, + "kl": 0.423828125, + "learning_rate": 7.450252951096121e-07, + "loss": 0.0004, + "reward": 3.3245279788970947, + "reward_std": 0.2012765109539032, + "rewards/final_reward": 1.801916952998812, + "rewards/mask_iou_reward": 0.900958476499406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3245280385017395, + "rewards/thk_ans_format_reward": 1.0, + "step": 756, + "think_completion_length": 52.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.953125, + "epoch": 1.2782462057335582, + "grad_norm": 9.471865510151117, + "kl": 0.40625, + "learning_rate": 7.446880269814502e-07, + "loss": 0.0004, + "reward": 3.371519088745117, + "reward_std": 0.1286439746618271, + "rewards/final_reward": 0.9775423947239636, + "rewards/mask_iou_reward": 0.4887711973619818, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3715189695358276, + "rewards/thk_ans_format_reward": 1.0, + "step": 757, + "think_completion_length": 52.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.703125, + "epoch": 1.2799325463743676, + "grad_norm": 11.33907667372978, + "kl": 0.455078125, + "learning_rate": 7.443507588532883e-07, + "loss": 0.0005, + "reward": 3.3066617250442505, + "reward_std": 0.1327102743089199, + "rewards/final_reward": 0.9268396180312112, + "rewards/mask_iou_reward": 0.4634198090156056, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3066617250442505, + "rewards/thk_ans_format_reward": 1.0, + "step": 758, + "think_completion_length": 51.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.953125, + "epoch": 1.281618887015177, + "grad_norm": 22.710178443792344, + "kl": 14.5859375, + "learning_rate": 7.440134907251264e-07, + "loss": 0.0146, + "reward": 3.4230951070785522, + "reward_std": 0.08178156521171331, + "rewards/final_reward": 1.6240219194514065, + "rewards/mask_iou_reward": 0.8120109597257033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4230949878692627, + "rewards/thk_ans_format_reward": 1.0, + "step": 759, + "think_completion_length": 47.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.953125, + "epoch": 1.2833052276559864, + "grad_norm": 7.669965203607217, + "kl": 0.4013671875, + "learning_rate": 7.436762225969646e-07, + "loss": 0.0004, + "reward": 3.1363537311553955, + "reward_std": 0.328082337975502, + "rewards/final_reward": 0.8638903050486981, + "rewards/mask_iou_reward": 0.43194515252434906, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1519787907600403, + "rewards/thk_ans_format_reward": 1.0, + "step": 760, + "think_completion_length": 54.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.453125, + "epoch": 1.284991568296796, + "grad_norm": 3.83970479145092, + "kl": 0.396484375, + "learning_rate": 7.433389544688027e-07, + "loss": 0.0004, + "reward": 2.9037578105926514, + "reward_std": 0.2324867658317089, + "rewards/final_reward": 0.9024056875965667, + "rewards/mask_iou_reward": 0.45120284379828335, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9350077509880066, + "rewards/thk_ans_format_reward": 0.984375, + "step": 761, + "think_completion_length": 50.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6875, + "epoch": 1.2866779089376055, + "grad_norm": 4.220985191384563, + "kl": 0.421875, + "learning_rate": 7.430016863406408e-07, + "loss": 0.0004, + "reward": 3.437883973121643, + "reward_std": 0.4040543884038925, + "rewards/final_reward": 1.551179383404998, + "rewards/mask_iou_reward": 0.775589691702499, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4691339135169983, + "rewards/thk_ans_format_reward": 0.984375, + "step": 762, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.59375, + "epoch": 1.2883642495784149, + "grad_norm": 4.057618415959299, + "kl": 0.44140625, + "learning_rate": 7.42664418212479e-07, + "loss": 0.0004, + "reward": 3.699872136116028, + "reward_std": 0.17768453806638718, + "rewards/final_reward": 1.8048597321254096, + "rewards/mask_iou_reward": 0.9024298660627048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.699872076511383, + "rewards/thk_ans_format_reward": 1.0, + "step": 763, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.203125, + "epoch": 1.2900505902192243, + "grad_norm": 7.307889009036356, + "kl": 0.40234375, + "learning_rate": 7.42327150084317e-07, + "loss": 0.0004, + "reward": 2.904768705368042, + "reward_std": 0.4192696511745453, + "rewards/final_reward": 0.9978510225487277, + "rewards/mask_iou_reward": 0.49892551127436385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.904768705368042, + "rewards/thk_ans_format_reward": 1.0, + "step": 764, + "think_completion_length": 57.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.390625, + "epoch": 1.2917369308600337, + "grad_norm": 3.8367155882101893, + "kl": 0.3955078125, + "learning_rate": 7.419898819561551e-07, + "loss": 0.0004, + "reward": 2.8194090127944946, + "reward_std": 0.10093265399336815, + "rewards/final_reward": 1.1600728340864759, + "rewards/mask_iou_reward": 0.5800364170432379, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8194091022014618, + "rewards/thk_ans_format_reward": 1.0, + "step": 765, + "think_completion_length": 52.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.65625, + "epoch": 1.2934232715008431, + "grad_norm": 57.108509817729846, + "kl": 0.482421875, + "learning_rate": 7.416526138279933e-07, + "loss": 0.0005, + "reward": 3.300451397895813, + "reward_std": 0.10350893437862396, + "rewards/final_reward": 1.2288283409460157, + "rewards/mask_iou_reward": 0.6144141704730078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.300451636314392, + "rewards/thk_ans_format_reward": 1.0, + "step": 766, + "think_completion_length": 45.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0625, + "epoch": 1.2951096121416525, + "grad_norm": 25.825394262271047, + "kl": 0.4189453125, + "learning_rate": 7.413153456998313e-07, + "loss": 0.0004, + "reward": 2.7487651109695435, + "reward_std": 0.17696194536983967, + "rewards/final_reward": 0.6962242875658127, + "rewards/mask_iou_reward": 0.34811214378290634, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7487652003765106, + "rewards/thk_ans_format_reward": 1.0, + "step": 767, + "think_completion_length": 50.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 1.2967959527824622, + "grad_norm": 7.441897134927553, + "kl": 0.7509765625, + "learning_rate": 7.409780775716694e-07, + "loss": 0.0008, + "reward": 3.0058658123016357, + "reward_std": 0.20971830189228058, + "rewards/final_reward": 0.6484055966150901, + "rewards/mask_iou_reward": 0.32420279830754506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0058658123016357, + "rewards/thk_ans_format_reward": 1.0, + "step": 768, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.859375, + "epoch": 1.2984822934232714, + "grad_norm": 11.188822729031108, + "kl": 0.388671875, + "learning_rate": 7.406408094435076e-07, + "loss": 0.0004, + "reward": 2.955198287963867, + "reward_std": 0.25020837038755417, + "rewards/final_reward": 1.4167753304898385, + "rewards/mask_iou_reward": 0.7083876652449193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9551981687545776, + "rewards/thk_ans_format_reward": 1.0, + "step": 769, + "think_completion_length": 48.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.578125, + "epoch": 1.300168634064081, + "grad_norm": 6.824503029956907, + "kl": 0.3935546875, + "learning_rate": 7.403035413153457e-07, + "loss": 0.0004, + "reward": 2.642868399620056, + "reward_std": 0.1783033236861229, + "rewards/final_reward": 0.2176228862671831, + "rewards/mask_iou_reward": 0.10881144313359155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6428685784339905, + "rewards/thk_ans_format_reward": 1.0, + "step": 770, + "think_completion_length": 49.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.625, + "epoch": 1.3018549747048904, + "grad_norm": 20.721620825901006, + "kl": 0.3740234375, + "learning_rate": 7.399662731871838e-07, + "loss": 0.0004, + "reward": 3.0567421913146973, + "reward_std": 0.07508281245827675, + "rewards/final_reward": 1.1172808054995134, + "rewards/mask_iou_reward": 0.5586404027497567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0567420572042465, + "rewards/thk_ans_format_reward": 1.0, + "step": 771, + "think_completion_length": 41.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 1.3035413153456998, + "grad_norm": 25.55148365834708, + "kl": 0.376953125, + "learning_rate": 7.39629005059022e-07, + "loss": 0.0004, + "reward": 2.609902501106262, + "reward_std": 0.3896046429872513, + "rewards/final_reward": 1.0064766722582872, + "rewards/mask_iou_reward": 0.5032383361291436, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6099025905132294, + "rewards/thk_ans_format_reward": 1.0, + "step": 772, + "think_completion_length": 54.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.9375, + "epoch": 1.3052276559865092, + "grad_norm": 4.538825899958136, + "kl": 1.71484375, + "learning_rate": 7.3929173693086e-07, + "loss": 0.0017, + "reward": 3.3087058067321777, + "reward_std": 0.14748084964230657, + "rewards/final_reward": 1.1135952740579675, + "rewards/mask_iou_reward": 0.5567976370289838, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.308705747127533, + "rewards/thk_ans_format_reward": 1.0, + "step": 773, + "think_completion_length": 47.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.03125, + "epoch": 1.3069139966273187, + "grad_norm": 5.948882090803801, + "kl": 0.419921875, + "learning_rate": 7.389544688026981e-07, + "loss": 0.0004, + "reward": 3.3070883750915527, + "reward_std": 0.16637492179870605, + "rewards/final_reward": 1.4802129145206084, + "rewards/mask_iou_reward": 0.7401064572603042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3070884346961975, + "rewards/thk_ans_format_reward": 1.0, + "step": 774, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5625, + "epoch": 1.3086003372681283, + "grad_norm": 11.679887672509015, + "kl": 0.4072265625, + "learning_rate": 7.386172006745362e-07, + "loss": 0.0004, + "reward": 3.3442749977111816, + "reward_std": 0.19580984860658646, + "rewards/final_reward": 1.843980964493185, + "rewards/mask_iou_reward": 0.9219904822465925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3442749977111816, + "rewards/thk_ans_format_reward": 1.0, + "step": 775, + "think_completion_length": 49.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6875, + "epoch": 1.3102866779089375, + "grad_norm": 7.511563914848423, + "kl": 0.47265625, + "learning_rate": 7.382799325463743e-07, + "loss": 0.0005, + "reward": 3.577596068382263, + "reward_std": 0.11906663700938225, + "rewards/final_reward": 1.8184439123146532, + "rewards/mask_iou_reward": 0.9092219561573266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.577596127986908, + "rewards/thk_ans_format_reward": 1.0, + "step": 776, + "think_completion_length": 43.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.828125, + "epoch": 1.3119730185497471, + "grad_norm": 6.771204942577976, + "kl": 0.390625, + "learning_rate": 7.379426644182124e-07, + "loss": 0.0004, + "reward": 3.6653069257736206, + "reward_std": 0.017012731172144413, + "rewards/final_reward": 1.5131021459032592, + "rewards/mask_iou_reward": 0.7565510729516296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6653068661689758, + "rewards/thk_ans_format_reward": 1.0, + "step": 777, + "think_completion_length": 43.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.921875, + "epoch": 1.3136593591905565, + "grad_norm": 11.832163253877466, + "kl": 0.375, + "learning_rate": 7.376053962900506e-07, + "loss": 0.0004, + "reward": 2.6942551136016846, + "reward_std": 0.13270641304552555, + "rewards/final_reward": 1.1430431598691082, + "rewards/mask_iou_reward": 0.5715215799345541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6942551732063293, + "rewards/thk_ans_format_reward": 1.0, + "step": 778, + "think_completion_length": 51.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.359375, + "epoch": 1.315345699831366, + "grad_norm": 4.538483574655835, + "kl": 0.388671875, + "learning_rate": 7.372681281618887e-07, + "loss": 0.0004, + "reward": 2.9698901176452637, + "reward_std": 0.35718169808387756, + "rewards/final_reward": 1.0378266910704492, + "rewards/mask_iou_reward": 0.5189133455352246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9698899984359741, + "rewards/thk_ans_format_reward": 1.0, + "step": 779, + "think_completion_length": 47.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.828125, + "epoch": 1.3170320404721754, + "grad_norm": 5.312250952721113, + "kl": 0.4326171875, + "learning_rate": 7.369308600337268e-07, + "loss": 0.0004, + "reward": 2.9204649925231934, + "reward_std": 0.2078213393688202, + "rewards/final_reward": 0.902995751127586, + "rewards/mask_iou_reward": 0.451497875563793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9204651415348053, + "rewards/thk_ans_format_reward": 1.0, + "step": 780, + "think_completion_length": 48.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.703125, + "epoch": 1.3187183811129848, + "grad_norm": 7.112756534834644, + "kl": 0.486328125, + "learning_rate": 7.365935919055649e-07, + "loss": 0.0005, + "reward": 3.3957865238189697, + "reward_std": 0.29513096809387207, + "rewards/final_reward": 1.289510311648049, + "rewards/mask_iou_reward": 0.6447551558240245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4114114046096802, + "rewards/thk_ans_format_reward": 0.984375, + "step": 781, + "think_completion_length": 51.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.046875, + "epoch": 1.3204047217537942, + "grad_norm": 5.092965912570209, + "kl": 0.3681640625, + "learning_rate": 7.36256323777403e-07, + "loss": 0.0004, + "reward": 2.766505479812622, + "reward_std": 0.17994603142142296, + "rewards/final_reward": 1.114762105762541, + "rewards/mask_iou_reward": 0.5573810528812705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7665055394172668, + "rewards/thk_ans_format_reward": 1.0, + "step": 782, + "think_completion_length": 48.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.9375, + "epoch": 1.3220910623946036, + "grad_norm": 15.032888120622165, + "kl": 0.4228515625, + "learning_rate": 7.35919055649241e-07, + "loss": 0.0004, + "reward": 2.8511098623275757, + "reward_std": 0.2713186927139759, + "rewards/final_reward": 0.51069635454804, + "rewards/mask_iou_reward": 0.25534817727402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8511098623275757, + "rewards/thk_ans_format_reward": 1.0, + "step": 783, + "think_completion_length": 43.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5, + "epoch": 1.3237774030354132, + "grad_norm": 10.726585243024278, + "kl": 0.388671875, + "learning_rate": 7.355817875210792e-07, + "loss": 0.0004, + "reward": 3.1994192600250244, + "reward_std": 0.07405038690194488, + "rewards/final_reward": 1.661394365132652, + "rewards/mask_iou_reward": 0.830697182566326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1994193196296692, + "rewards/thk_ans_format_reward": 1.0, + "step": 784, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.921875, + "epoch": 1.3254637436762227, + "grad_norm": 6.017124048186462, + "kl": 1.078125, + "learning_rate": 7.352445193929173e-07, + "loss": 0.0011, + "reward": 3.557616353034973, + "reward_std": 0.17845550179481506, + "rewards/final_reward": 1.5886097221131004, + "rewards/mask_iou_reward": 0.7943048610565502, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5576163530349731, + "rewards/thk_ans_format_reward": 1.0, + "step": 785, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.140625, + "epoch": 1.327150084317032, + "grad_norm": 9.94888675204662, + "kl": 0.48046875, + "learning_rate": 7.349072512647555e-07, + "loss": 0.0005, + "reward": 3.732688069343567, + "reward_std": 0.053104594349861145, + "rewards/final_reward": 1.82699322983769, + "rewards/mask_iou_reward": 0.913496614918845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7326880097389221, + "rewards/thk_ans_format_reward": 1.0, + "step": 786, + "think_completion_length": 48.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.96875, + "epoch": 1.3288364249578415, + "grad_norm": 5.103127094843979, + "kl": 0.4130859375, + "learning_rate": 7.345699831365936e-07, + "loss": 0.0004, + "reward": 3.539349317550659, + "reward_std": 0.08835931494832039, + "rewards/final_reward": 1.3966989172543376, + "rewards/mask_iou_reward": 0.6983494586271688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5393492579460144, + "rewards/thk_ans_format_reward": 1.0, + "step": 787, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.53125, + "epoch": 1.330522765598651, + "grad_norm": 4.8496798516590705, + "kl": 0.4423828125, + "learning_rate": 7.342327150084317e-07, + "loss": 0.0004, + "reward": 2.9521323442459106, + "reward_std": 0.26333199441432953, + "rewards/final_reward": 0.4716661765206818, + "rewards/mask_iou_reward": 0.2358330882603409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9521324634552002, + "rewards/thk_ans_format_reward": 1.0, + "step": 788, + "think_completion_length": 50.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.28125, + "epoch": 1.3322091062394603, + "grad_norm": 19.90609841032805, + "kl": 0.4208984375, + "learning_rate": 7.338954468802699e-07, + "loss": 0.0004, + "reward": 2.9552639722824097, + "reward_std": 0.030708997743204236, + "rewards/final_reward": 0.9783795007841563, + "rewards/mask_iou_reward": 0.4891897503920781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9552639275789261, + "rewards/thk_ans_format_reward": 1.0, + "step": 789, + "think_completion_length": 45.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.640625, + "epoch": 1.3338954468802697, + "grad_norm": 7.037305808313017, + "kl": 0.431640625, + "learning_rate": 7.335581787521079e-07, + "loss": 0.0004, + "reward": 3.3982867002487183, + "reward_std": 0.17157932370901108, + "rewards/final_reward": 1.3373675370522078, + "rewards/mask_iou_reward": 0.6686837685261039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3982867002487183, + "rewards/thk_ans_format_reward": 1.0, + "step": 790, + "think_completion_length": 49.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.515625, + "epoch": 1.3355817875210794, + "grad_norm": 14.637715513326066, + "kl": 0.4501953125, + "learning_rate": 7.332209106239459e-07, + "loss": 0.0005, + "reward": 3.3809139728546143, + "reward_std": 0.3167175129055977, + "rewards/final_reward": 1.8598589636448288, + "rewards/mask_iou_reward": 0.9299294818224144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3809138536453247, + "rewards/thk_ans_format_reward": 1.0, + "step": 791, + "think_completion_length": 48.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.328125, + "epoch": 1.3372681281618888, + "grad_norm": 63.794113836109965, + "kl": 0.4267578125, + "learning_rate": 7.328836424957841e-07, + "loss": 0.0004, + "reward": 3.1712608337402344, + "reward_std": 0.029415050521492958, + "rewards/final_reward": 0.8930833080697248, + "rewards/mask_iou_reward": 0.4465416540348624, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1712608337402344, + "rewards/thk_ans_format_reward": 1.0, + "step": 792, + "think_completion_length": 49.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.15625, + "epoch": 1.3389544688026982, + "grad_norm": 7.031832709390953, + "kl": 0.705078125, + "learning_rate": 7.325463743676222e-07, + "loss": 0.0007, + "reward": 3.364739775657654, + "reward_std": 0.13826466910541058, + "rewards/final_reward": 1.8511947276274574, + "rewards/mask_iou_reward": 0.9255973638137287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3647398352622986, + "rewards/thk_ans_format_reward": 1.0, + "step": 793, + "think_completion_length": 54.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.390625, + "epoch": 1.3406408094435076, + "grad_norm": 4.754955191343839, + "kl": 0.4521484375, + "learning_rate": 7.322091062394603e-07, + "loss": 0.0005, + "reward": 3.746224522590637, + "reward_std": 0.06749487156048417, + "rewards/final_reward": 1.532440120591791, + "rewards/mask_iou_reward": 0.7662200602958955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7462245225906372, + "rewards/thk_ans_format_reward": 1.0, + "step": 794, + "think_completion_length": 49.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.421875, + "epoch": 1.342327150084317, + "grad_norm": 6.023167821978929, + "kl": 0.4140625, + "learning_rate": 7.318718381112985e-07, + "loss": 0.0004, + "reward": 3.211634874343872, + "reward_std": 0.1754566803574562, + "rewards/final_reward": 1.3337678355455997, + "rewards/mask_iou_reward": 0.6668839177727999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2116347551345825, + "rewards/thk_ans_format_reward": 1.0, + "step": 795, + "think_completion_length": 46.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.421875, + "epoch": 1.3440134907251264, + "grad_norm": 8.1196912130005, + "kl": 0.41015625, + "learning_rate": 7.315345699831366e-07, + "loss": 0.0004, + "reward": 3.1976126432418823, + "reward_std": 0.3106941059231758, + "rewards/final_reward": 0.861060513926105, + "rewards/mask_iou_reward": 0.4305302569630525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.197612702846527, + "rewards/thk_ans_format_reward": 1.0, + "step": 796, + "think_completion_length": 51.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.8125, + "epoch": 1.3456998313659359, + "grad_norm": 21.674449037201075, + "kl": 0.455078125, + "learning_rate": 7.311973018549747e-07, + "loss": 0.0005, + "reward": 3.1202865839004517, + "reward_std": 0.21427929773926735, + "rewards/final_reward": 1.0092933340170596, + "rewards/mask_iou_reward": 0.5046466670085298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1202866435050964, + "rewards/thk_ans_format_reward": 1.0, + "step": 797, + "think_completion_length": 50.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.4375, + "epoch": 1.3473861720067455, + "grad_norm": 9.802205092093088, + "kl": 0.421875, + "learning_rate": 7.308600337268129e-07, + "loss": 0.0004, + "reward": 2.962410807609558, + "reward_std": 0.2575262784957886, + "rewards/final_reward": 0.8481624548206849, + "rewards/mask_iou_reward": 0.42408122741034243, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9624108374118805, + "rewards/thk_ans_format_reward": 1.0, + "step": 798, + "think_completion_length": 59.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.5, + "epoch": 1.3490725126475547, + "grad_norm": 8.383808482465419, + "kl": 0.4482421875, + "learning_rate": 7.305227655986509e-07, + "loss": 0.0004, + "reward": 3.0775705575942993, + "reward_std": 0.25188253819942474, + "rewards/final_reward": 1.043739117647533, + "rewards/mask_iou_reward": 0.5218695588237665, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0931956470012665, + "rewards/thk_ans_format_reward": 1.0, + "step": 799, + "think_completion_length": 51.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.9375, + "epoch": 1.3507588532883643, + "grad_norm": 9.518272268526802, + "kl": 0.3837890625, + "learning_rate": 7.301854974704889e-07, + "loss": 0.0004, + "reward": 3.0064759254455566, + "reward_std": 0.4474050849676132, + "rewards/final_reward": 0.5596855725569843, + "rewards/mask_iou_reward": 0.27984278627849213, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.006475806236267, + "rewards/thk_ans_format_reward": 1.0, + "step": 800, + "think_completion_length": 54.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.65625, + "epoch": 1.3524451939291737, + "grad_norm": 6.1880658285669785, + "kl": 0.4833984375, + "learning_rate": 7.298482293423271e-07, + "loss": 0.0005, + "reward": 3.1219156980514526, + "reward_std": 0.30353303998708725, + "rewards/final_reward": 1.1040391526657047, + "rewards/mask_iou_reward": 0.5520195763328524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.121915653347969, + "rewards/thk_ans_format_reward": 1.0, + "step": 801, + "think_completion_length": 53.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.546875, + "epoch": 1.3541315345699831, + "grad_norm": 9.625310119019451, + "kl": 0.4765625, + "learning_rate": 7.295109612141652e-07, + "loss": 0.0005, + "reward": 3.4418479204177856, + "reward_std": 0.22188640385866165, + "rewards/final_reward": 1.8515527330361707, + "rewards/mask_iou_reward": 0.9257763665180854, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4418478608131409, + "rewards/thk_ans_format_reward": 1.0, + "step": 802, + "think_completion_length": 47.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.28125, + "epoch": 1.3558178752107926, + "grad_norm": 8.513931375294348, + "kl": 0.4482421875, + "learning_rate": 7.291736930860033e-07, + "loss": 0.0004, + "reward": 3.2668780088424683, + "reward_std": 0.17149719037115574, + "rewards/final_reward": 1.26499462085288, + "rewards/mask_iou_reward": 0.63249731042644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.266878068447113, + "rewards/thk_ans_format_reward": 1.0, + "step": 803, + "think_completion_length": 53.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.09375, + "epoch": 1.357504215851602, + "grad_norm": 10.137968549400595, + "kl": 0.4609375, + "learning_rate": 7.288364249578415e-07, + "loss": 0.0005, + "reward": 3.2986963987350464, + "reward_std": 0.28296051174402237, + "rewards/final_reward": 1.6626159487723857, + "rewards/mask_iou_reward": 0.8313079743861929, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3143212795257568, + "rewards/thk_ans_format_reward": 1.0, + "step": 804, + "think_completion_length": 50.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.546875, + "epoch": 1.3591905564924116, + "grad_norm": 6.621797528995769, + "kl": 0.4189453125, + "learning_rate": 7.284991568296796e-07, + "loss": 0.0004, + "reward": 3.6139891147613525, + "reward_std": 0.3693799674510956, + "rewards/final_reward": 1.80038448323186, + "rewards/mask_iou_reward": 0.90019224161593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6139891147613525, + "rewards/thk_ans_format_reward": 1.0, + "step": 805, + "think_completion_length": 56.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.671875, + "epoch": 1.3608768971332208, + "grad_norm": 42.489537876572705, + "kl": 0.439453125, + "learning_rate": 7.281618887015177e-07, + "loss": 0.0005, + "reward": 3.02545964717865, + "reward_std": 0.10977564379572868, + "rewards/final_reward": 1.0804780233200864, + "rewards/mask_iou_reward": 0.5402390116600432, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.04108464717865, + "rewards/thk_ans_format_reward": 1.0, + "step": 806, + "think_completion_length": 56.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.75, + "epoch": 1.3625632377740304, + "grad_norm": 13.881123412247668, + "kl": 0.4365234375, + "learning_rate": 7.278246205733559e-07, + "loss": 0.0004, + "reward": 3.5061166286468506, + "reward_std": 0.2770638167858124, + "rewards/final_reward": 1.5442817771756299, + "rewards/mask_iou_reward": 0.7721408885878149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5061166882514954, + "rewards/thk_ans_format_reward": 1.0, + "step": 807, + "think_completion_length": 63.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.53125, + "epoch": 1.3642495784148398, + "grad_norm": 44.01742029034989, + "kl": 0.455078125, + "learning_rate": 7.274873524451938e-07, + "loss": 0.0005, + "reward": 3.0958797931671143, + "reward_std": 0.06491614319384098, + "rewards/final_reward": 1.5420486778303586, + "rewards/mask_iou_reward": 0.7710243389151793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.095879852771759, + "rewards/thk_ans_format_reward": 1.0, + "step": 808, + "think_completion_length": 54.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 1.3659359190556493, + "grad_norm": 7.506006853834921, + "kl": 0.4501953125, + "learning_rate": 7.271500843170319e-07, + "loss": 0.0005, + "reward": 3.369332194328308, + "reward_std": 0.3005400598049164, + "rewards/final_reward": 1.4333333109147426, + "rewards/mask_iou_reward": 0.7166666554573713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3693323731422424, + "rewards/thk_ans_format_reward": 1.0, + "step": 809, + "think_completion_length": 50.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.09375, + "epoch": 1.3676222596964587, + "grad_norm": 5.096555219506506, + "kl": 0.4072265625, + "learning_rate": 7.268128161888701e-07, + "loss": 0.0004, + "reward": 3.2549372911453247, + "reward_std": 0.11136971414089203, + "rewards/final_reward": 1.3701246547199526, + "rewards/mask_iou_reward": 0.6850623273599763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.25493723154068, + "rewards/thk_ans_format_reward": 1.0, + "step": 810, + "think_completion_length": 52.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.34375, + "epoch": 1.369308600337268, + "grad_norm": 4.165347153065698, + "kl": 0.4443359375, + "learning_rate": 7.264755480607082e-07, + "loss": 0.0004, + "reward": 3.306629776954651, + "reward_std": 0.20777817629277706, + "rewards/final_reward": 1.4866853816181873, + "rewards/mask_iou_reward": 0.7433426908090937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.306629717350006, + "rewards/thk_ans_format_reward": 1.0, + "step": 811, + "think_completion_length": 51.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4375, + "epoch": 1.3709949409780775, + "grad_norm": 16.28929272778552, + "kl": 0.427734375, + "learning_rate": 7.261382799325464e-07, + "loss": 0.0004, + "reward": 3.4713690280914307, + "reward_std": 0.3839820772409439, + "rewards/final_reward": 1.678402783229804, + "rewards/mask_iou_reward": 0.839201391614902, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.5026191473007202, + "rewards/thk_ans_format_reward": 1.0, + "step": 812, + "think_completion_length": 54.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.15625, + "epoch": 1.372681281618887, + "grad_norm": 7.969905817192309, + "kl": 0.392578125, + "learning_rate": 7.258010118043845e-07, + "loss": 0.0004, + "reward": 3.7847758531570435, + "reward_std": 0.1786189409904182, + "rewards/final_reward": 1.7245267628009748, + "rewards/mask_iou_reward": 0.8622633814004874, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.784775972366333, + "rewards/thk_ans_format_reward": 1.0, + "step": 813, + "think_completion_length": 50.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.515625, + "epoch": 1.3743676222596966, + "grad_norm": 4.368367320540565, + "kl": 0.4111328125, + "learning_rate": 7.254637436762226e-07, + "loss": 0.0004, + "reward": 3.223156690597534, + "reward_std": 0.10445614764466882, + "rewards/final_reward": 0.6421020966983467, + "rewards/mask_iou_reward": 0.32105104834917336, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2387816905975342, + "rewards/thk_ans_format_reward": 1.0, + "step": 814, + "think_completion_length": 58.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.359375, + "epoch": 1.376053962900506, + "grad_norm": 5.653449443697027, + "kl": 0.455078125, + "learning_rate": 7.251264755480608e-07, + "loss": 0.0005, + "reward": 3.348850727081299, + "reward_std": 0.04212654661387205, + "rewards/final_reward": 1.0661254905194237, + "rewards/mask_iou_reward": 0.5330627452597119, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3488507866859436, + "rewards/thk_ans_format_reward": 1.0, + "step": 815, + "think_completion_length": 52.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.921875, + "epoch": 1.3777403035413154, + "grad_norm": 6.96340082616904, + "kl": 0.4375, + "learning_rate": 7.247892074198987e-07, + "loss": 0.0004, + "reward": 3.5712579488754272, + "reward_std": 0.2067468911409378, + "rewards/final_reward": 1.284450105073941, + "rewards/mask_iou_reward": 0.6422250525369705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5712580680847168, + "rewards/thk_ans_format_reward": 1.0, + "step": 816, + "think_completion_length": 58.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.359375, + "epoch": 1.3794266441821248, + "grad_norm": 5.219801172976387, + "kl": 0.396484375, + "learning_rate": 7.244519392917368e-07, + "loss": 0.0004, + "reward": 3.140303611755371, + "reward_std": 0.13667535781860352, + "rewards/final_reward": 0.8110110717477675, + "rewards/mask_iou_reward": 0.40550553587388377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.140303611755371, + "rewards/thk_ans_format_reward": 1.0, + "step": 817, + "think_completion_length": 60.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.609375, + "epoch": 1.3811129848229342, + "grad_norm": 4.155579350217279, + "kl": 0.3583984375, + "learning_rate": 7.24114671163575e-07, + "loss": 0.0004, + "reward": 2.7823375463485718, + "reward_std": 0.2525832876563072, + "rewards/final_reward": 0.6301977560473451, + "rewards/mask_iou_reward": 0.31509887802367254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.782337486743927, + "rewards/thk_ans_format_reward": 1.0, + "step": 818, + "think_completion_length": 53.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.625, + "epoch": 1.3827993254637436, + "grad_norm": 16.17353535009975, + "kl": 0.4296875, + "learning_rate": 7.237774030354131e-07, + "loss": 0.0004, + "reward": 3.2859296798706055, + "reward_std": 0.0780464205890894, + "rewards/final_reward": 1.156685700621454, + "rewards/mask_iou_reward": 0.578342850310727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.285929560661316, + "rewards/thk_ans_format_reward": 1.0, + "step": 819, + "think_completion_length": 53.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.90625, + "epoch": 1.384485666104553, + "grad_norm": 6.598230146187883, + "kl": 0.408203125, + "learning_rate": 7.234401349072512e-07, + "loss": 0.0004, + "reward": 3.308698892593384, + "reward_std": 0.14432849548757076, + "rewards/final_reward": 0.821684403678177, + "rewards/mask_iou_reward": 0.4108422018390885, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3086988925933838, + "rewards/thk_ans_format_reward": 1.0, + "step": 820, + "think_completion_length": 55.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.84375, + "epoch": 1.3861720067453627, + "grad_norm": 7.929231773635772, + "kl": 0.419921875, + "learning_rate": 7.231028667790894e-07, + "loss": 0.0004, + "reward": 3.2562429904937744, + "reward_std": 0.18748314306139946, + "rewards/final_reward": 1.221749844106246, + "rewards/mask_iou_reward": 0.610874922053123, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2562429904937744, + "rewards/thk_ans_format_reward": 1.0, + "step": 821, + "think_completion_length": 53.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.8125, + "epoch": 1.387858347386172, + "grad_norm": 16.674172381983915, + "kl": 0.455078125, + "learning_rate": 7.227655986509275e-07, + "loss": 0.0005, + "reward": 3.353366732597351, + "reward_std": 0.20042579993605614, + "rewards/final_reward": 0.8192778731739727, + "rewards/mask_iou_reward": 0.40963893658698636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3533667922019958, + "rewards/thk_ans_format_reward": 1.0, + "step": 822, + "think_completion_length": 60.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0, + "epoch": 1.3895446880269815, + "grad_norm": 13.233620326708719, + "kl": 0.408203125, + "learning_rate": 7.224283305227656e-07, + "loss": 0.0004, + "reward": 3.7568464279174805, + "reward_std": 0.035045892000198364, + "rewards/final_reward": 1.9271590606246267, + "rewards/mask_iou_reward": 0.9635795303123134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7568464279174805, + "rewards/thk_ans_format_reward": 1.0, + "step": 823, + "think_completion_length": 58.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.140625, + "epoch": 1.391231028667791, + "grad_norm": 5.457368929175906, + "kl": 0.365234375, + "learning_rate": 7.220910623946038e-07, + "loss": 0.0004, + "reward": 3.212060332298279, + "reward_std": 0.2671176493167877, + "rewards/final_reward": 1.6695634439718066, + "rewards/mask_iou_reward": 0.8347817219859033, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2276853322982788, + "rewards/thk_ans_format_reward": 1.0, + "step": 824, + "think_completion_length": 54.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.6875, + "epoch": 1.3929173693086003, + "grad_norm": 9.735584218214859, + "kl": 0.474609375, + "learning_rate": 7.217537942664417e-07, + "loss": 0.0005, + "reward": 3.3629668951034546, + "reward_std": 0.20313755422830582, + "rewards/final_reward": 1.441508973487453, + "rewards/mask_iou_reward": 0.7207544867437266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3629668951034546, + "rewards/thk_ans_format_reward": 1.0, + "step": 825, + "think_completion_length": 55.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.21875, + "epoch": 1.3946037099494097, + "grad_norm": 8.522991047543638, + "kl": 0.4296875, + "learning_rate": 7.214165261382798e-07, + "loss": 0.0004, + "reward": 3.4234044551849365, + "reward_std": 0.40352555364370346, + "rewards/final_reward": 1.0577974036356383, + "rewards/mask_iou_reward": 0.5288987018178192, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4234043955802917, + "rewards/thk_ans_format_reward": 1.0, + "step": 826, + "think_completion_length": 55.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 1.3962900505902192, + "grad_norm": 15.899916744193234, + "kl": 0.4189453125, + "learning_rate": 7.21079258010118e-07, + "loss": 0.0004, + "reward": 3.3997583389282227, + "reward_std": 0.10933668166399002, + "rewards/final_reward": 1.0764809433615938, + "rewards/mask_iou_reward": 0.5382404716807969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3997583985328674, + "rewards/thk_ans_format_reward": 1.0, + "step": 827, + "think_completion_length": 49.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.734375, + "epoch": 1.3979763912310288, + "grad_norm": 5.387528232664865, + "kl": 0.4287109375, + "learning_rate": 7.207419898819561e-07, + "loss": 0.0004, + "reward": 3.3618130683898926, + "reward_std": 0.2051006779074669, + "rewards/final_reward": 1.1968374017252494, + "rewards/mask_iou_reward": 0.5984187008626247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3618130087852478, + "rewards/thk_ans_format_reward": 1.0, + "step": 828, + "think_completion_length": 54.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.625, + "epoch": 1.399662731871838, + "grad_norm": 12.898067805148317, + "kl": 0.4130859375, + "learning_rate": 7.204047217537942e-07, + "loss": 0.0004, + "reward": 3.607871890068054, + "reward_std": 0.12004952877759933, + "rewards/final_reward": 1.4956458828207402, + "rewards/mask_iou_reward": 0.7478229414103701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.607871949672699, + "rewards/thk_ans_format_reward": 1.0, + "step": 829, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 1.4013490725126476, + "grad_norm": 9.22840356229312, + "kl": 0.42578125, + "learning_rate": 7.200674536256324e-07, + "loss": 0.0004, + "reward": 2.9152538776397705, + "reward_std": 0.16578126698732376, + "rewards/final_reward": 1.0610994051270966, + "rewards/mask_iou_reward": 0.5305497025635483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9152538776397705, + "rewards/thk_ans_format_reward": 1.0, + "step": 830, + "think_completion_length": 54.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.734375, + "epoch": 1.403035413153457, + "grad_norm": 15.11425361744145, + "kl": 0.3671875, + "learning_rate": 7.197301854974705e-07, + "loss": 0.0004, + "reward": 3.1335614919662476, + "reward_std": 0.2536454573273659, + "rewards/final_reward": 1.614278669479924, + "rewards/mask_iou_reward": 0.807139334739962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1335614323616028, + "rewards/thk_ans_format_reward": 1.0, + "step": 831, + "think_completion_length": 59.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.359375, + "epoch": 1.4047217537942664, + "grad_norm": 13.660412359550188, + "kl": 0.4609375, + "learning_rate": 7.193929173693086e-07, + "loss": 0.0005, + "reward": 3.0477449893951416, + "reward_std": 0.28462807834148407, + "rewards/final_reward": 1.306179247026351, + "rewards/mask_iou_reward": 0.6530896235131755, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0633699893951416, + "rewards/thk_ans_format_reward": 1.0, + "step": 832, + "think_completion_length": 58.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.609375, + "epoch": 1.4064080944350759, + "grad_norm": 4.762614594252812, + "kl": 0.4189453125, + "learning_rate": 7.190556492411467e-07, + "loss": 0.0004, + "reward": 3.0881171226501465, + "reward_std": 0.2660984881222248, + "rewards/final_reward": 0.7231560324770698, + "rewards/mask_iou_reward": 0.3615780162385349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0881169438362122, + "rewards/thk_ans_format_reward": 1.0, + "step": 833, + "think_completion_length": 60.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.96875, + "epoch": 1.4080944350758853, + "grad_norm": 157.51508190258878, + "kl": 0.490234375, + "learning_rate": 7.187183811129847e-07, + "loss": 0.0005, + "reward": 3.0769537687301636, + "reward_std": 0.15264162048697472, + "rewards/final_reward": 1.3258674626582714, + "rewards/mask_iou_reward": 0.6629337313291357, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0769538879394531, + "rewards/thk_ans_format_reward": 1.0, + "step": 834, + "think_completion_length": 50.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.484375, + "epoch": 1.4097807757166947, + "grad_norm": 20.709158949497038, + "kl": 0.4609375, + "learning_rate": 7.183811129848229e-07, + "loss": 0.0005, + "reward": 3.406615734100342, + "reward_std": 0.07765450701117516, + "rewards/final_reward": 1.1670628162854402, + "rewards/mask_iou_reward": 0.5835314081427201, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.406615674495697, + "rewards/thk_ans_format_reward": 1.0, + "step": 835, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.359375, + "epoch": 1.411467116357504, + "grad_norm": 14.63598679525673, + "kl": 0.4140625, + "learning_rate": 7.18043844856661e-07, + "loss": 0.0004, + "reward": 3.6451518535614014, + "reward_std": 0.3170605003833771, + "rewards/final_reward": 1.7678322310469072, + "rewards/mask_iou_reward": 0.8839161155234536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6451519131660461, + "rewards/thk_ans_format_reward": 1.0, + "step": 836, + "think_completion_length": 58.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8125, + "epoch": 1.4131534569983137, + "grad_norm": 18.364809498373194, + "kl": 0.4267578125, + "learning_rate": 7.177065767284991e-07, + "loss": 0.0004, + "reward": 3.175464391708374, + "reward_std": 0.21890763938426971, + "rewards/final_reward": 1.4261317621170528, + "rewards/mask_iou_reward": 0.7130658810585264, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.175464391708374, + "rewards/thk_ans_format_reward": 1.0, + "step": 837, + "think_completion_length": 59.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.796875, + "epoch": 1.4148397976391232, + "grad_norm": 4.826306279345156, + "kl": 0.4453125, + "learning_rate": 7.173693086003373e-07, + "loss": 0.0004, + "reward": 3.2676570415496826, + "reward_std": 0.07519757375121117, + "rewards/final_reward": 0.8388647625645046, + "rewards/mask_iou_reward": 0.4194323812822523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2676568925380707, + "rewards/thk_ans_format_reward": 1.0, + "step": 838, + "think_completion_length": 55.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.96875, + "epoch": 1.4165261382799326, + "grad_norm": 4.5560595609149495, + "kl": 0.400390625, + "learning_rate": 7.170320404721754e-07, + "loss": 0.0004, + "reward": 3.2105181217193604, + "reward_std": 0.10261780396103859, + "rewards/final_reward": 0.9419267024011992, + "rewards/mask_iou_reward": 0.4709633512005996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2105180025100708, + "rewards/thk_ans_format_reward": 1.0, + "step": 839, + "think_completion_length": 55.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.796875, + "epoch": 1.418212478920742, + "grad_norm": 5.034386179760682, + "kl": 0.439453125, + "learning_rate": 7.166947723440135e-07, + "loss": 0.0004, + "reward": 2.976960778236389, + "reward_std": 0.08269162010401487, + "rewards/final_reward": 0.4845235261759516, + "rewards/mask_iou_reward": 0.2422617630879758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9769608080387115, + "rewards/thk_ans_format_reward": 1.0, + "step": 840, + "think_completion_length": 54.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.09375, + "epoch": 1.4198988195615514, + "grad_norm": 16.873982914654313, + "kl": 0.513671875, + "learning_rate": 7.163575042158516e-07, + "loss": 0.0005, + "reward": 3.330239415168762, + "reward_std": 0.09310411475598812, + "rewards/final_reward": 0.9805123238516863, + "rewards/mask_iou_reward": 0.49025616192584315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3302394151687622, + "rewards/thk_ans_format_reward": 1.0, + "step": 841, + "think_completion_length": 52.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.578125, + "epoch": 1.4215851602023608, + "grad_norm": 21.973761932873924, + "kl": 0.330078125, + "learning_rate": 7.160202360876897e-07, + "loss": 0.0003, + "reward": 3.7035324573516846, + "reward_std": 0.18692347779870033, + "rewards/final_reward": 1.8463038185252576, + "rewards/mask_iou_reward": 0.9231519092626288, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.7191572785377502, + "rewards/thk_ans_format_reward": 1.0, + "step": 842, + "think_completion_length": 51.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.921875, + "epoch": 1.4232715008431702, + "grad_norm": 3.8783438591055797, + "kl": 0.453125, + "learning_rate": 7.156829679595277e-07, + "loss": 0.0005, + "reward": 3.3667051792144775, + "reward_std": 0.13266583997756243, + "rewards/final_reward": 1.1172658691646342, + "rewards/mask_iou_reward": 0.5586329345823171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3667052388191223, + "rewards/thk_ans_format_reward": 1.0, + "step": 843, + "think_completion_length": 54.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.5, + "epoch": 1.4249578414839799, + "grad_norm": 5.68908269467362, + "kl": 0.4140625, + "learning_rate": 7.153456998313659e-07, + "loss": 0.0004, + "reward": 3.704403042793274, + "reward_std": 0.12302776426076889, + "rewards/final_reward": 1.7305376594747914, + "rewards/mask_iou_reward": 0.8652688297373957, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.704403042793274, + "rewards/thk_ans_format_reward": 1.0, + "step": 844, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.671875, + "epoch": 1.4266441821247893, + "grad_norm": 7.873233196363575, + "kl": 0.4736328125, + "learning_rate": 7.15008431703204e-07, + "loss": 0.0005, + "reward": 3.1094895601272583, + "reward_std": 0.056312352418899536, + "rewards/final_reward": 0.7561530607928131, + "rewards/mask_iou_reward": 0.37807653039640654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1094896793365479, + "rewards/thk_ans_format_reward": 1.0, + "step": 845, + "think_completion_length": 58.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.953125, + "epoch": 1.4283305227655987, + "grad_norm": 5.862238379381449, + "kl": 0.4033203125, + "learning_rate": 7.146711635750421e-07, + "loss": 0.0004, + "reward": 2.8694413900375366, + "reward_std": 0.26014316687360406, + "rewards/final_reward": 0.23614428593740902, + "rewards/mask_iou_reward": 0.11807214296870451, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8850663900375366, + "rewards/thk_ans_format_reward": 1.0, + "step": 846, + "think_completion_length": 55.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.125, + "epoch": 1.430016863406408, + "grad_norm": 97.83723017637423, + "kl": 0.4453125, + "learning_rate": 7.143338954468803e-07, + "loss": 0.0005, + "reward": 2.65997576713562, + "reward_std": 0.33230482041835785, + "rewards/final_reward": 0.5149375789828794, + "rewards/mask_iou_reward": 0.2574687894914397, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6599756479263306, + "rewards/thk_ans_format_reward": 1.0, + "step": 847, + "think_completion_length": 53.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 1.4317032040472175, + "grad_norm": 15.2524038009595, + "kl": 0.4248046875, + "learning_rate": 7.139966273187184e-07, + "loss": 0.0004, + "reward": 3.162472367286682, + "reward_std": 0.2069089524447918, + "rewards/final_reward": 1.571867908834975, + "rewards/mask_iou_reward": 0.7859339544174875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1624723076820374, + "rewards/thk_ans_format_reward": 1.0, + "step": 848, + "think_completion_length": 49.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.0625, + "epoch": 1.433389544688027, + "grad_norm": 11.31711407452566, + "kl": 0.509765625, + "learning_rate": 7.136593591905564e-07, + "loss": 0.0005, + "reward": 3.353234648704529, + "reward_std": 0.3007300794124603, + "rewards/final_reward": 1.415233789323357, + "rewards/mask_iou_reward": 0.7076168946616785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3532347679138184, + "rewards/thk_ans_format_reward": 1.0, + "step": 849, + "think_completion_length": 58.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5, + "epoch": 1.4350758853288363, + "grad_norm": 8.058228701945565, + "kl": 0.4248046875, + "learning_rate": 7.133220910623946e-07, + "loss": 0.0005, + "reward": 3.258637309074402, + "reward_std": 0.028607182670384645, + "rewards/final_reward": 1.4189250618857554, + "rewards/mask_iou_reward": 0.7094625309428777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2586371302604675, + "rewards/thk_ans_format_reward": 1.0, + "step": 850, + "think_completion_length": 51.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.734375, + "epoch": 1.436762225969646, + "grad_norm": 9.547448920950147, + "kl": 0.50390625, + "learning_rate": 7.129848229342327e-07, + "loss": 0.0005, + "reward": 2.6284313201904297, + "reward_std": 0.20265497267246246, + "rewards/final_reward": 0.8259624311973379, + "rewards/mask_iou_reward": 0.41298121559866896, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6284312754869461, + "rewards/thk_ans_format_reward": 1.0, + "step": 851, + "think_completion_length": 60.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6875, + "epoch": 1.4384485666104554, + "grad_norm": 13.224954231135428, + "kl": 0.474609375, + "learning_rate": 7.126475548060707e-07, + "loss": 0.0005, + "reward": 2.9994101524353027, + "reward_std": 0.12460505217313766, + "rewards/final_reward": 1.8945995775852573, + "rewards/mask_iou_reward": 0.9472997887926287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9994100630283356, + "rewards/thk_ans_format_reward": 1.0, + "step": 852, + "think_completion_length": 53.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.40625, + "epoch": 1.4401349072512648, + "grad_norm": 8.569023489569883, + "kl": 0.458984375, + "learning_rate": 7.123102866779089e-07, + "loss": 0.0005, + "reward": 3.078709602355957, + "reward_std": 0.11330131255090237, + "rewards/final_reward": 0.8808447919914785, + "rewards/mask_iou_reward": 0.44042239599573924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0787094831466675, + "rewards/thk_ans_format_reward": 1.0, + "step": 853, + "think_completion_length": 48.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.21875, + "epoch": 1.4418212478920742, + "grad_norm": 4.642482646726498, + "kl": 0.521484375, + "learning_rate": 7.11973018549747e-07, + "loss": 0.0005, + "reward": 3.0897263288497925, + "reward_std": 0.2790881544351578, + "rewards/final_reward": 0.9708697595854819, + "rewards/mask_iou_reward": 0.48543487979274097, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0897262394428253, + "rewards/thk_ans_format_reward": 1.0, + "step": 854, + "think_completion_length": 48.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 1.4435075885328836, + "grad_norm": 6.027895450430307, + "kl": 0.41796875, + "learning_rate": 7.116357504215851e-07, + "loss": 0.0004, + "reward": 3.2061294317245483, + "reward_std": 0.1781318113207817, + "rewards/final_reward": 0.8162860726709539, + "rewards/mask_iou_reward": 0.40814303633547694, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.206129550933838, + "rewards/thk_ans_format_reward": 1.0, + "step": 855, + "think_completion_length": 58.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.828125, + "epoch": 1.445193929173693, + "grad_norm": 4.937207974636539, + "kl": 0.4892578125, + "learning_rate": 7.112984822934233e-07, + "loss": 0.0005, + "reward": 3.273247718811035, + "reward_std": 0.44166913628578186, + "rewards/final_reward": 1.2405309436027023, + "rewards/mask_iou_reward": 0.6202654718013512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2732477188110352, + "rewards/thk_ans_format_reward": 1.0, + "step": 856, + "think_completion_length": 51.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.359375, + "epoch": 1.4468802698145025, + "grad_norm": 4.498629741082507, + "kl": 0.447265625, + "learning_rate": 7.109612141652614e-07, + "loss": 0.0004, + "reward": 3.0108615159988403, + "reward_std": 0.3564150631427765, + "rewards/final_reward": 0.9811074353066257, + "rewards/mask_iou_reward": 0.49055371765331285, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.0421114563941956, + "rewards/thk_ans_format_reward": 1.0, + "step": 857, + "think_completion_length": 49.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.9375, + "epoch": 1.448566610455312, + "grad_norm": 40.5484481050642, + "kl": 0.4423828125, + "learning_rate": 7.106239460370994e-07, + "loss": 0.0004, + "reward": 3.179835796356201, + "reward_std": 0.16212757676839828, + "rewards/final_reward": 1.2437849535160206, + "rewards/mask_iou_reward": 0.6218924767580103, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1798357963562012, + "rewards/thk_ans_format_reward": 1.0, + "step": 858, + "think_completion_length": 57.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.21875, + "epoch": 1.4502529510961213, + "grad_norm": 9.542311459129479, + "kl": 0.501953125, + "learning_rate": 7.102866779089376e-07, + "loss": 0.0005, + "reward": 3.281885504722595, + "reward_std": 0.40248236060142517, + "rewards/final_reward": 1.2012127911041488, + "rewards/mask_iou_reward": 0.6006063955520744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2818855047225952, + "rewards/thk_ans_format_reward": 1.0, + "step": 859, + "think_completion_length": 55.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.84375, + "epoch": 1.451939291736931, + "grad_norm": 4.3575825018615495, + "kl": 0.4375, + "learning_rate": 7.099494097807756e-07, + "loss": 0.0004, + "reward": 3.309635281562805, + "reward_std": 0.275321364402771, + "rewards/final_reward": 1.567287356990522, + "rewards/mask_iou_reward": 0.783643678495261, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.309635192155838, + "rewards/thk_ans_format_reward": 1.0, + "step": 860, + "think_completion_length": 49.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.5, + "epoch": 1.4536256323777403, + "grad_norm": 17.364544031528975, + "kl": 0.4736328125, + "learning_rate": 7.096121416526138e-07, + "loss": 0.0005, + "reward": 3.50772488117218, + "reward_std": 0.06305067986249924, + "rewards/final_reward": 1.5693529211308035, + "rewards/mask_iou_reward": 0.7846764605654017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5077248811721802, + "rewards/thk_ans_format_reward": 1.0, + "step": 861, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.078125, + "epoch": 1.4553119730185498, + "grad_norm": 9.445189061831712, + "kl": 0.431640625, + "learning_rate": 7.092748735244519e-07, + "loss": 0.0004, + "reward": 3.5429933071136475, + "reward_std": 0.17221157252788544, + "rewards/final_reward": 1.6822896347621752, + "rewards/mask_iou_reward": 0.8411448173810876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5429933071136475, + "rewards/thk_ans_format_reward": 1.0, + "step": 862, + "think_completion_length": 53.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.875, + "epoch": 1.4569983136593592, + "grad_norm": 59.79920462584689, + "kl": 0.44140625, + "learning_rate": 7.0893760539629e-07, + "loss": 0.0004, + "reward": 3.3259243965148926, + "reward_std": 0.2988605722784996, + "rewards/final_reward": 1.1951013711153204, + "rewards/mask_iou_reward": 0.5975506855576602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3259243965148926, + "rewards/thk_ans_format_reward": 1.0, + "step": 863, + "think_completion_length": 53.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.375, + "epoch": 1.4586846543001686, + "grad_norm": 10.79964206810661, + "kl": 0.5322265625, + "learning_rate": 7.086003372681282e-07, + "loss": 0.0006, + "reward": 2.977699041366577, + "reward_std": 0.10583911696448922, + "rewards/final_reward": 1.0417962258507818, + "rewards/mask_iou_reward": 0.5208981129253909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.97769895195961, + "rewards/thk_ans_format_reward": 1.0, + "step": 864, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.59375, + "epoch": 1.460370994940978, + "grad_norm": 18.176945366133, + "kl": 0.4658203125, + "learning_rate": 7.082630691399663e-07, + "loss": 0.0005, + "reward": 3.491799473762512, + "reward_std": 0.06794925779104233, + "rewards/final_reward": 1.199618176053284, + "rewards/mask_iou_reward": 0.599809088026642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.491799533367157, + "rewards/thk_ans_format_reward": 1.0, + "step": 865, + "think_completion_length": 46.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 1.4620573355817874, + "grad_norm": 19.21865624105213, + "kl": 0.4833984375, + "learning_rate": 7.079258010118043e-07, + "loss": 0.0005, + "reward": 3.5655884742736816, + "reward_std": 0.22954148054122925, + "rewards/final_reward": 1.7901023068675446, + "rewards/mask_iou_reward": 0.8950511534337723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5655885338783264, + "rewards/thk_ans_format_reward": 1.0, + "step": 866, + "think_completion_length": 52.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.671875, + "epoch": 1.463743676222597, + "grad_norm": 11.208424920176157, + "kl": 0.443359375, + "learning_rate": 7.075885328836425e-07, + "loss": 0.0004, + "reward": 3.2776330709457397, + "reward_std": 0.1639660745859146, + "rewards/final_reward": 1.0371694517989962, + "rewards/mask_iou_reward": 0.5185847258994981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2776331305503845, + "rewards/thk_ans_format_reward": 1.0, + "step": 867, + "think_completion_length": 48.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.0625, + "epoch": 1.4654300168634065, + "grad_norm": 11.883223028850797, + "kl": 0.4833984375, + "learning_rate": 7.072512647554806e-07, + "loss": 0.0005, + "reward": 3.4130160808563232, + "reward_std": 0.2615826725959778, + "rewards/final_reward": 1.2130193291082727, + "rewards/mask_iou_reward": 0.6065096645541364, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.413015902042389, + "rewards/thk_ans_format_reward": 1.0, + "step": 868, + "think_completion_length": 54.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.265625, + "epoch": 1.4671163575042159, + "grad_norm": 32.80673354192309, + "kl": 0.44140625, + "learning_rate": 7.069139966273186e-07, + "loss": 0.0004, + "reward": 2.798851251602173, + "reward_std": 0.08375886641442776, + "rewards/final_reward": 0.5250274420052581, + "rewards/mask_iou_reward": 0.26251372100262904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7988512814044952, + "rewards/thk_ans_format_reward": 1.0, + "step": 869, + "think_completion_length": 54.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.3125, + "epoch": 1.4688026981450253, + "grad_norm": 13.261868917995077, + "kl": 0.462890625, + "learning_rate": 7.065767284991568e-07, + "loss": 0.0005, + "reward": 2.923264503479004, + "reward_std": 0.25585998594760895, + "rewards/final_reward": 0.6537531827199606, + "rewards/mask_iou_reward": 0.3268765913599803, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9232644140720367, + "rewards/thk_ans_format_reward": 1.0, + "step": 870, + "think_completion_length": 54.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.515625, + "epoch": 1.4704890387858347, + "grad_norm": 5.530281082751846, + "kl": 0.4130859375, + "learning_rate": 7.062394603709949e-07, + "loss": 0.0004, + "reward": 3.422648787498474, + "reward_std": 0.2950022518634796, + "rewards/final_reward": 1.0513954006982185, + "rewards/mask_iou_reward": 0.5256977003491092, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.422648847103119, + "rewards/thk_ans_format_reward": 1.0, + "step": 871, + "think_completion_length": 48.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.078125, + "epoch": 1.4721753794266441, + "grad_norm": 7.472701314041896, + "kl": 0.4140625, + "learning_rate": 7.05902192242833e-07, + "loss": 0.0004, + "reward": 3.1202709674835205, + "reward_std": 0.0359388068318367, + "rewards/final_reward": 0.9082897754423601, + "rewards/mask_iou_reward": 0.45414488772118006, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1202709674835205, + "rewards/thk_ans_format_reward": 1.0, + "step": 872, + "think_completion_length": 52.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.453125, + "epoch": 1.4738617200674535, + "grad_norm": 9.384000312254551, + "kl": 0.5283203125, + "learning_rate": 7.055649241146712e-07, + "loss": 0.0005, + "reward": 3.0585156679153442, + "reward_std": 0.1298949345946312, + "rewards/final_reward": 1.1770550045989658, + "rewards/mask_iou_reward": 0.5885275022994829, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0741406977176666, + "rewards/thk_ans_format_reward": 0.984375, + "step": 873, + "think_completion_length": 50.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.359375, + "epoch": 1.4755480607082632, + "grad_norm": 10.702095709503649, + "kl": 0.4658203125, + "learning_rate": 7.052276559865092e-07, + "loss": 0.0005, + "reward": 3.082894802093506, + "reward_std": 0.22249618917703629, + "rewards/final_reward": 1.1370577026434994, + "rewards/mask_iou_reward": 0.5685288513217497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0828947126865387, + "rewards/thk_ans_format_reward": 1.0, + "step": 874, + "think_completion_length": 50.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.59375, + "epoch": 1.4772344013490726, + "grad_norm": 9.649217015576328, + "kl": 0.443359375, + "learning_rate": 7.048903878583473e-07, + "loss": 0.0004, + "reward": 2.738980293273926, + "reward_std": 0.2116129845380783, + "rewards/final_reward": 0.9144510817754059, + "rewards/mask_iou_reward": 0.45722554088770295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7389805316925049, + "rewards/thk_ans_format_reward": 1.0, + "step": 875, + "think_completion_length": 60.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.75, + "epoch": 1.478920741989882, + "grad_norm": 6.049179427563422, + "kl": 0.412109375, + "learning_rate": 7.045531197301855e-07, + "loss": 0.0004, + "reward": 3.134316563606262, + "reward_std": 0.2520030327141285, + "rewards/final_reward": 1.0917850139519656, + "rewards/mask_iou_reward": 0.5458925069759828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1499416530132294, + "rewards/thk_ans_format_reward": 0.984375, + "step": 876, + "think_completion_length": 59.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.703125, + "epoch": 1.4806070826306914, + "grad_norm": 6.690438754177215, + "kl": 0.466796875, + "learning_rate": 7.042158516020236e-07, + "loss": 0.0005, + "reward": 3.6443511247634888, + "reward_std": 0.09915501996874809, + "rewards/final_reward": 1.5526876263785594, + "rewards/mask_iou_reward": 0.7763438131892797, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.644351065158844, + "rewards/thk_ans_format_reward": 1.0, + "step": 877, + "think_completion_length": 56.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.34375, + "epoch": 1.4822934232715008, + "grad_norm": 13.592751851346678, + "kl": 0.52734375, + "learning_rate": 7.038785834738616e-07, + "loss": 0.0005, + "reward": 3.3902156352996826, + "reward_std": 0.13225466385483742, + "rewards/final_reward": 1.526574811986646, + "rewards/mask_iou_reward": 0.763287405993323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.390215516090393, + "rewards/thk_ans_format_reward": 1.0, + "step": 878, + "think_completion_length": 46.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.296875, + "epoch": 1.4839797639123102, + "grad_norm": 5.0377686056084166, + "kl": 0.4716796875, + "learning_rate": 7.035413153456998e-07, + "loss": 0.0005, + "reward": 3.232161045074463, + "reward_std": 0.12451484799385071, + "rewards/final_reward": 0.8035639148908846, + "rewards/mask_iou_reward": 0.4017819574454423, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2321611642837524, + "rewards/thk_ans_format_reward": 1.0, + "step": 879, + "think_completion_length": 47.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.03125, + "epoch": 1.4856661045531196, + "grad_norm": 5.453689899823007, + "kl": 0.42578125, + "learning_rate": 7.032040472175379e-07, + "loss": 0.0004, + "reward": 3.3886868953704834, + "reward_std": 0.15230849012732506, + "rewards/final_reward": 1.1831722336546733, + "rewards/mask_iou_reward": 0.5915861168273366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3886866569519043, + "rewards/thk_ans_format_reward": 1.0, + "step": 880, + "think_completion_length": 52.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.71875, + "epoch": 1.4873524451939293, + "grad_norm": 10.350225069844578, + "kl": 0.41015625, + "learning_rate": 7.02866779089376e-07, + "loss": 0.0004, + "reward": 3.2247852087020874, + "reward_std": 0.22761035338044167, + "rewards/final_reward": 1.3909674394845886, + "rewards/mask_iou_reward": 0.6954837197422943, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2247852683067322, + "rewards/thk_ans_format_reward": 1.0, + "step": 881, + "think_completion_length": 56.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.328125, + "epoch": 1.4890387858347387, + "grad_norm": 4.851130451842982, + "kl": 0.4716796875, + "learning_rate": 7.025295109612142e-07, + "loss": 0.0005, + "reward": 3.4963871240615845, + "reward_std": 0.19990779552608728, + "rewards/final_reward": 1.3284091362959223, + "rewards/mask_iou_reward": 0.6642045681479611, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4963870644569397, + "rewards/thk_ans_format_reward": 1.0, + "step": 882, + "think_completion_length": 55.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.421875, + "epoch": 1.4907251264755481, + "grad_norm": 4.651181463731348, + "kl": 0.482421875, + "learning_rate": 7.021922428330522e-07, + "loss": 0.0005, + "reward": 3.3203155994415283, + "reward_std": 0.15234145522117615, + "rewards/final_reward": 0.845040981767172, + "rewards/mask_iou_reward": 0.422520490883586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.320315659046173, + "rewards/thk_ans_format_reward": 1.0, + "step": 883, + "think_completion_length": 55.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.703125, + "epoch": 1.4924114671163575, + "grad_norm": 5.863150136793525, + "kl": 0.44921875, + "learning_rate": 7.018549747048903e-07, + "loss": 0.0005, + "reward": 3.320623278617859, + "reward_std": 0.1672058179974556, + "rewards/final_reward": 1.8739585309298803, + "rewards/mask_iou_reward": 0.9369792654649401, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3206232190132141, + "rewards/thk_ans_format_reward": 1.0, + "step": 884, + "think_completion_length": 53.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.515625, + "epoch": 1.494097807757167, + "grad_norm": 6.021223525582373, + "kl": 0.5, + "learning_rate": 7.015177065767285e-07, + "loss": 0.0005, + "reward": 3.579367160797119, + "reward_std": 0.35837820172309875, + "rewards/final_reward": 1.5228644932419437, + "rewards/mask_iou_reward": 0.7614322466209719, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.594992220401764, + "rewards/thk_ans_format_reward": 1.0, + "step": 885, + "think_completion_length": 61.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.984375, + "epoch": 1.4957841483979764, + "grad_norm": 6.5531915249452215, + "kl": 0.3818359375, + "learning_rate": 7.011804384485666e-07, + "loss": 0.0004, + "reward": 2.4116973876953125, + "reward_std": 0.24532928317785263, + "rewards/final_reward": 0.09217479847421674, + "rewards/mask_iou_reward": 0.04608739923710837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4116973280906677, + "rewards/thk_ans_format_reward": 1.0, + "step": 886, + "think_completion_length": 53.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.21875, + "epoch": 1.4974704890387858, + "grad_norm": 22.897900079081385, + "kl": 0.4697265625, + "learning_rate": 7.008431703204047e-07, + "loss": 0.0005, + "reward": 3.2448573112487793, + "reward_std": 0.3474937528371811, + "rewards/final_reward": 1.0248634815796156, + "rewards/mask_iou_reward": 0.5124317407898078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.244857370853424, + "rewards/thk_ans_format_reward": 1.0, + "step": 887, + "think_completion_length": 58.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 1.4991568296795954, + "grad_norm": 26.551364418368514, + "kl": 0.439453125, + "learning_rate": 7.005059021922428e-07, + "loss": 0.0004, + "reward": 3.463243246078491, + "reward_std": 0.481712244451046, + "rewards/final_reward": 1.6564740215537062, + "rewards/mask_iou_reward": 0.8282370107768531, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4788681864738464, + "rewards/thk_ans_format_reward": 1.0, + "step": 888, + "think_completion_length": 52.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.375, + "epoch": 1.5008431703204046, + "grad_norm": 25.417332706180506, + "kl": 0.470703125, + "learning_rate": 7.001686340640809e-07, + "loss": 0.0005, + "reward": 2.934293508529663, + "reward_std": 0.1236649677157402, + "rewards/final_reward": 1.0390162157073441, + "rewards/mask_iou_reward": 0.5195081078536721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9342935383319855, + "rewards/thk_ans_format_reward": 1.0, + "step": 889, + "think_completion_length": 52.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.25, + "epoch": 1.5025295109612142, + "grad_norm": 7.780530546518989, + "kl": 0.544921875, + "learning_rate": 6.998313659359191e-07, + "loss": 0.0005, + "reward": 3.188772201538086, + "reward_std": 0.2717321440577507, + "rewards/final_reward": 0.9025040048437754, + "rewards/mask_iou_reward": 0.4512520024218877, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.188772201538086, + "rewards/thk_ans_format_reward": 1.0, + "step": 890, + "think_completion_length": 56.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.390625, + "epoch": 1.5042158516020236, + "grad_norm": 6.45209714621979, + "kl": 0.44921875, + "learning_rate": 6.994940978077571e-07, + "loss": 0.0005, + "reward": 3.103861451148987, + "reward_std": 0.1689981073141098, + "rewards/final_reward": 1.4753462516466826, + "rewards/mask_iou_reward": 0.7376731258233413, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1038614511489868, + "rewards/thk_ans_format_reward": 1.0, + "step": 891, + "think_completion_length": 45.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.96875, + "epoch": 1.505902192242833, + "grad_norm": 28.14646763114755, + "kl": 0.439453125, + "learning_rate": 6.991568296795952e-07, + "loss": 0.0004, + "reward": 3.271444320678711, + "reward_std": 0.038284238427877426, + "rewards/final_reward": 1.3904845505095014, + "rewards/mask_iou_reward": 0.6952422752547507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2714443802833557, + "rewards/thk_ans_format_reward": 1.0, + "step": 892, + "think_completion_length": 52.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.875, + "epoch": 1.5075885328836425, + "grad_norm": 7.348843640714404, + "kl": 0.4296875, + "learning_rate": 6.988195615514334e-07, + "loss": 0.0004, + "reward": 3.3650012016296387, + "reward_std": 0.23928017914295197, + "rewards/final_reward": 1.1050898513339247, + "rewards/mask_iou_reward": 0.5525449256669623, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.365001142024994, + "rewards/thk_ans_format_reward": 1.0, + "step": 893, + "think_completion_length": 62.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6875, + "epoch": 1.5092748735244519, + "grad_norm": 58.76319880638057, + "kl": 0.513671875, + "learning_rate": 6.984822934232715e-07, + "loss": 0.0005, + "reward": 3.389597177505493, + "reward_std": 0.2816731631755829, + "rewards/final_reward": 1.569462222983449, + "rewards/mask_iou_reward": 0.7847311114917245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3895971775054932, + "rewards/thk_ans_format_reward": 1.0, + "step": 894, + "think_completion_length": 48.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.90625, + "epoch": 1.5109612141652615, + "grad_norm": 5.146659429828279, + "kl": 0.46484375, + "learning_rate": 6.981450252951096e-07, + "loss": 0.0005, + "reward": 3.0576419830322266, + "reward_std": 0.19554530084133148, + "rewards/final_reward": 0.757199370585173, + "rewards/mask_iou_reward": 0.3785996852925865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0576421320438385, + "rewards/thk_ans_format_reward": 1.0, + "step": 895, + "think_completion_length": 56.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5625, + "epoch": 1.5126475548060707, + "grad_norm": 8.009726459732972, + "kl": 0.490234375, + "learning_rate": 6.978077571669477e-07, + "loss": 0.0005, + "reward": 2.517244815826416, + "reward_std": 0.1560732051730156, + "rewards/final_reward": 0.7029032977709968, + "rewards/mask_iou_reward": 0.3514516488854984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5172448828816414, + "rewards/thk_ans_format_reward": 1.0, + "step": 896, + "think_completion_length": 53.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.25, + "epoch": 1.5143338954468804, + "grad_norm": 11.691308332853232, + "kl": 0.439453125, + "learning_rate": 6.974704890387858e-07, + "loss": 0.0004, + "reward": 2.8864370584487915, + "reward_std": 0.3114437907934189, + "rewards/final_reward": 1.1898409841779651, + "rewards/mask_iou_reward": 0.5949204920889826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8864371180534363, + "rewards/thk_ans_format_reward": 1.0, + "step": 897, + "think_completion_length": 54.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.828125, + "epoch": 1.5160202360876898, + "grad_norm": 20.22786026987595, + "kl": 0.4736328125, + "learning_rate": 6.971332209106239e-07, + "loss": 0.0005, + "reward": 3.25240159034729, + "reward_std": 0.38694237172603607, + "rewards/final_reward": 1.4252261662740588, + "rewards/mask_iou_reward": 0.7126130831370294, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2680267095565796, + "rewards/thk_ans_format_reward": 1.0, + "step": 898, + "think_completion_length": 54.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.09375, + "epoch": 1.5177065767284992, + "grad_norm": 4.799547084634126, + "kl": 1.21484375, + "learning_rate": 6.96795952782462e-07, + "loss": 0.0012, + "reward": 3.003826141357422, + "reward_std": 0.3299378901720047, + "rewards/final_reward": 0.8713669023703968, + "rewards/mask_iou_reward": 0.4356834511851984, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0194511413574219, + "rewards/thk_ans_format_reward": 1.0, + "step": 899, + "think_completion_length": 51.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.015625, + "epoch": 1.5193929173693086, + "grad_norm": 8.586444312018184, + "kl": 0.447265625, + "learning_rate": 6.964586846543001e-07, + "loss": 0.0004, + "reward": 3.177174210548401, + "reward_std": 0.21209881751565263, + "rewards/final_reward": 1.8933416244051449, + "rewards/mask_iou_reward": 0.9466708122025724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1771742403507233, + "rewards/thk_ans_format_reward": 1.0, + "step": 900, + "think_completion_length": 56.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.890625, + "epoch": 1.521079258010118, + "grad_norm": 6.139072484754312, + "kl": 0.421875, + "learning_rate": 6.961214165261382e-07, + "loss": 0.0004, + "reward": 2.9001829624176025, + "reward_std": 0.11913806945085526, + "rewards/final_reward": 1.1471034366880408, + "rewards/mask_iou_reward": 0.5735517183440204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9001829624176025, + "rewards/thk_ans_format_reward": 1.0, + "step": 901, + "think_completion_length": 56.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.984375, + "epoch": 1.5227655986509276, + "grad_norm": 38.17727674571188, + "kl": 0.435546875, + "learning_rate": 6.957841483979764e-07, + "loss": 0.0004, + "reward": 3.7425217628479004, + "reward_std": 0.033854744397103786, + "rewards/final_reward": 1.688420361475836, + "rewards/mask_iou_reward": 0.844210180737918, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7425217628479004, + "rewards/thk_ans_format_reward": 1.0, + "step": 902, + "think_completion_length": 59.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.6875, + "epoch": 1.5244519392917368, + "grad_norm": 17.620597318581506, + "kl": 0.4287109375, + "learning_rate": 6.954468802698145e-07, + "loss": 0.0004, + "reward": 2.8093347549438477, + "reward_std": 0.08729386702179909, + "rewards/final_reward": 0.4415490172857114, + "rewards/mask_iou_reward": 0.2207745086428557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8093347251415253, + "rewards/thk_ans_format_reward": 1.0, + "step": 903, + "think_completion_length": 62.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.90625, + "epoch": 1.5261382799325465, + "grad_norm": 81.3433205071691, + "kl": 0.431640625, + "learning_rate": 6.951096121416526e-07, + "loss": 0.0004, + "reward": 3.593350052833557, + "reward_std": 0.22224682942032814, + "rewards/final_reward": 1.5034894963403924, + "rewards/mask_iou_reward": 0.7517447481701962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5933499336242676, + "rewards/thk_ans_format_reward": 1.0, + "step": 904, + "think_completion_length": 56.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.3125, + "epoch": 1.5278246205733557, + "grad_norm": 4.356153558926586, + "kl": 0.390625, + "learning_rate": 6.947723440134907e-07, + "loss": 0.0004, + "reward": 3.0900958776474, + "reward_std": 0.21000684797763824, + "rewards/final_reward": 0.9526622565340548, + "rewards/mask_iou_reward": 0.4763311282670274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0900957882404327, + "rewards/thk_ans_format_reward": 1.0, + "step": 905, + "think_completion_length": 64.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.234375, + "epoch": 1.5295109612141653, + "grad_norm": 11.420126935243049, + "kl": 0.41796875, + "learning_rate": 6.944350758853288e-07, + "loss": 0.0004, + "reward": 3.141783356666565, + "reward_std": 0.12532974779605865, + "rewards/final_reward": 1.3057637626816818, + "rewards/mask_iou_reward": 0.6528818813408409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.141783207654953, + "rewards/thk_ans_format_reward": 1.0, + "step": 906, + "think_completion_length": 51.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.4375, + "epoch": 1.5311973018549747, + "grad_norm": 33.024581244226404, + "kl": 0.443359375, + "learning_rate": 6.940978077571668e-07, + "loss": 0.0004, + "reward": 3.2856587171554565, + "reward_std": 0.23818902671337128, + "rewards/final_reward": 1.5964721680485652, + "rewards/mask_iou_reward": 0.7982360840242826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2856586575508118, + "rewards/thk_ans_format_reward": 1.0, + "step": 907, + "think_completion_length": 59.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.546875, + "epoch": 1.5328836424957841, + "grad_norm": 4.441225018652143, + "kl": 0.498046875, + "learning_rate": 6.93760539629005e-07, + "loss": 0.0005, + "reward": 3.2125693559646606, + "reward_std": 0.17255272343754768, + "rewards/final_reward": 1.3438855575718396, + "rewards/mask_iou_reward": 0.6719427787859198, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2125694155693054, + "rewards/thk_ans_format_reward": 1.0, + "step": 908, + "think_completion_length": 58.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.078125, + "epoch": 1.5345699831365935, + "grad_norm": 6.618112377181082, + "kl": 0.3896484375, + "learning_rate": 6.934232715008431e-07, + "loss": 0.0004, + "reward": 3.194159507751465, + "reward_std": 0.2769739478826523, + "rewards/final_reward": 0.8722471206493585, + "rewards/mask_iou_reward": 0.43612356032467925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1941594183444977, + "rewards/thk_ans_format_reward": 1.0, + "step": 909, + "think_completion_length": 55.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.296875, + "epoch": 1.536256323777403, + "grad_norm": 3.976682860182766, + "kl": 0.431640625, + "learning_rate": 6.930860033726813e-07, + "loss": 0.0004, + "reward": 3.5049896240234375, + "reward_std": 0.09566706418991089, + "rewards/final_reward": 1.4766909435750448, + "rewards/mask_iou_reward": 0.7383454717875224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5049898028373718, + "rewards/thk_ans_format_reward": 1.0, + "step": 910, + "think_completion_length": 55.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.65625, + "epoch": 1.5379426644182126, + "grad_norm": 8.674155382800526, + "kl": 0.515625, + "learning_rate": 6.927487352445194e-07, + "loss": 0.0005, + "reward": 3.68453848361969, + "reward_std": 0.3740627020597458, + "rewards/final_reward": 1.6059642488093178, + "rewards/mask_iou_reward": 0.8029821244046589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6845386028289795, + "rewards/thk_ans_format_reward": 1.0, + "step": 911, + "think_completion_length": 60.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.828125, + "epoch": 1.5396290050590218, + "grad_norm": 4.201349564079745, + "kl": 0.4345703125, + "learning_rate": 6.924114671163575e-07, + "loss": 0.0004, + "reward": 2.7524369955062866, + "reward_std": 0.06399728916585445, + "rewards/final_reward": 0.05723391880944776, + "rewards/mask_iou_reward": 0.02861695940472388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.752437025308609, + "rewards/thk_ans_format_reward": 1.0, + "step": 912, + "think_completion_length": 57.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.640625, + "epoch": 1.5413153456998314, + "grad_norm": 15.707962181930915, + "kl": 0.482421875, + "learning_rate": 6.920741989881957e-07, + "loss": 0.0005, + "reward": 3.2202765941619873, + "reward_std": 0.2580166608095169, + "rewards/final_reward": 1.5070204731055663, + "rewards/mask_iou_reward": 0.7535102365527832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.220276415348053, + "rewards/thk_ans_format_reward": 1.0, + "step": 913, + "think_completion_length": 52.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.234375, + "epoch": 1.5430016863406408, + "grad_norm": 4.612716388916718, + "kl": 0.46484375, + "learning_rate": 6.917369308600337e-07, + "loss": 0.0005, + "reward": 3.229029417037964, + "reward_std": 0.2685174345970154, + "rewards/final_reward": 1.2355635873006694, + "rewards/mask_iou_reward": 0.6177817936503347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.229029357433319, + "rewards/thk_ans_format_reward": 1.0, + "step": 914, + "think_completion_length": 61.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.75, + "epoch": 1.5446880269814502, + "grad_norm": 9.71609930718754, + "kl": 0.412109375, + "learning_rate": 6.913996627318718e-07, + "loss": 0.0004, + "reward": 3.4811172485351562, + "reward_std": 0.18026774376630783, + "rewards/final_reward": 1.3179288333104693, + "rewards/mask_iou_reward": 0.6589644166552346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.481117308139801, + "rewards/thk_ans_format_reward": 1.0, + "step": 915, + "think_completion_length": 58.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.484375, + "epoch": 1.5463743676222597, + "grad_norm": 6.062189232886983, + "kl": 0.388671875, + "learning_rate": 6.910623946037099e-07, + "loss": 0.0004, + "reward": 3.1265569925308228, + "reward_std": 0.11232495307922363, + "rewards/final_reward": 1.1716458595219756, + "rewards/mask_iou_reward": 0.5858229297609878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1265568435192108, + "rewards/thk_ans_format_reward": 1.0, + "step": 916, + "think_completion_length": 55.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.46875, + "epoch": 1.548060708263069, + "grad_norm": 29.322825812695466, + "kl": 0.40625, + "learning_rate": 6.90725126475548e-07, + "loss": 0.0004, + "reward": 3.158001184463501, + "reward_std": 0.3152724876999855, + "rewards/final_reward": 1.6397486316758947, + "rewards/mask_iou_reward": 0.8198743158379473, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1580011546611786, + "rewards/thk_ans_format_reward": 1.0, + "step": 917, + "think_completion_length": 63.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.15625, + "epoch": 1.5497470489038787, + "grad_norm": 4.637874584865913, + "kl": 0.431640625, + "learning_rate": 6.903878583473861e-07, + "loss": 0.0004, + "reward": 3.1512595415115356, + "reward_std": 0.05409781076014042, + "rewards/final_reward": 1.146352360593079, + "rewards/mask_iou_reward": 0.5731761802965395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1512594819068909, + "rewards/thk_ans_format_reward": 1.0, + "step": 918, + "think_completion_length": 59.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 1.551433389544688, + "grad_norm": 7.788632543194693, + "kl": 0.474609375, + "learning_rate": 6.900505902192243e-07, + "loss": 0.0005, + "reward": 3.1258350610733032, + "reward_std": 0.07057809643447399, + "rewards/final_reward": 0.8358368847323898, + "rewards/mask_iou_reward": 0.4179184423661949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1258350908756256, + "rewards/thk_ans_format_reward": 1.0, + "step": 919, + "think_completion_length": 58.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0625, + "epoch": 1.5531197301854975, + "grad_norm": 5.97941155669541, + "kl": 0.4501953125, + "learning_rate": 6.897133220910624e-07, + "loss": 0.0004, + "reward": 3.0851335525512695, + "reward_std": 0.22871796786785126, + "rewards/final_reward": 1.0473059909656413, + "rewards/mask_iou_reward": 0.5236529954828206, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0851335227489471, + "rewards/thk_ans_format_reward": 1.0, + "step": 920, + "think_completion_length": 55.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.640625, + "epoch": 1.554806070826307, + "grad_norm": 4.936714793425599, + "kl": 0.4384765625, + "learning_rate": 6.893760539629005e-07, + "loss": 0.0004, + "reward": 3.1050972938537598, + "reward_std": 0.12275232374668121, + "rewards/final_reward": 1.113718007247768, + "rewards/mask_iou_reward": 0.556859003623884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1050972640514374, + "rewards/thk_ans_format_reward": 1.0, + "step": 921, + "think_completion_length": 54.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.25, + "epoch": 1.5564924114671164, + "grad_norm": 10.771346363486499, + "kl": 0.443359375, + "learning_rate": 6.890387858347387e-07, + "loss": 0.0004, + "reward": 3.220325231552124, + "reward_std": 0.22513248771429062, + "rewards/final_reward": 1.4726670164797664, + "rewards/mask_iou_reward": 0.7363335082398832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.220325231552124, + "rewards/thk_ans_format_reward": 1.0, + "step": 922, + "think_completion_length": 54.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.03125, + "epoch": 1.5581787521079258, + "grad_norm": 6.072745008400607, + "kl": 0.4453125, + "learning_rate": 6.887015177065767e-07, + "loss": 0.0004, + "reward": 2.69256329536438, + "reward_std": 0.2358478605747223, + "rewards/final_reward": 1.005379959545185, + "rewards/mask_iou_reward": 0.5026899797725926, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.708188384771347, + "rewards/thk_ans_format_reward": 1.0, + "step": 923, + "think_completion_length": 58.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.296875, + "epoch": 1.5598650927487352, + "grad_norm": 5.613134816077137, + "kl": 0.462890625, + "learning_rate": 6.883642495784147e-07, + "loss": 0.0005, + "reward": 3.6800777912139893, + "reward_std": 0.07642281427979469, + "rewards/final_reward": 1.8789947092503736, + "rewards/mask_iou_reward": 0.9394973546251868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6800777912139893, + "rewards/thk_ans_format_reward": 1.0, + "step": 924, + "think_completion_length": 60.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.125, + "epoch": 1.5615514333895448, + "grad_norm": 5.487692228810072, + "kl": 0.4228515625, + "learning_rate": 6.880269814502529e-07, + "loss": 0.0004, + "reward": 2.868987202644348, + "reward_std": 0.18398159742355347, + "rewards/final_reward": 0.4337001279491935, + "rewards/mask_iou_reward": 0.21685006397459675, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8689871728420258, + "rewards/thk_ans_format_reward": 1.0, + "step": 925, + "think_completion_length": 63.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.59375, + "epoch": 1.563237774030354, + "grad_norm": 8.794735029230512, + "kl": 0.453125, + "learning_rate": 6.87689713322091e-07, + "loss": 0.0005, + "reward": 3.198965072631836, + "reward_std": 0.21983014419674873, + "rewards/final_reward": 1.1240214130545105, + "rewards/mask_iou_reward": 0.5620107065272553, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1989650130271912, + "rewards/thk_ans_format_reward": 1.0, + "step": 926, + "think_completion_length": 59.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.671875, + "epoch": 1.5649241146711637, + "grad_norm": 12.984976043343616, + "kl": 0.521484375, + "learning_rate": 6.873524451939291e-07, + "loss": 0.0005, + "reward": 3.4763259887695312, + "reward_std": 0.15147725492715836, + "rewards/final_reward": 1.8704005807661068, + "rewards/mask_iou_reward": 0.9352002903830534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4763262271881104, + "rewards/thk_ans_format_reward": 1.0, + "step": 927, + "think_completion_length": 54.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0, + "epoch": 1.566610455311973, + "grad_norm": 5.291031497794727, + "kl": 0.470703125, + "learning_rate": 6.870151770657673e-07, + "loss": 0.0005, + "reward": 3.0472246408462524, + "reward_std": 0.371063232421875, + "rewards/final_reward": 1.653832077696684, + "rewards/mask_iou_reward": 0.826916038848342, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0472246408462524, + "rewards/thk_ans_format_reward": 1.0, + "step": 928, + "think_completion_length": 50.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.71875, + "epoch": 1.5682967959527825, + "grad_norm": 5.172504836959009, + "kl": 0.4326171875, + "learning_rate": 6.866779089376054e-07, + "loss": 0.0004, + "reward": 2.9630849361419678, + "reward_std": 0.03674542997032404, + "rewards/final_reward": 0.9665219127822793, + "rewards/mask_iou_reward": 0.48326095639113964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.963085075840354, + "rewards/thk_ans_format_reward": 1.0, + "step": 929, + "think_completion_length": 55.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.78125, + "epoch": 1.569983136593592, + "grad_norm": 6.651356452309486, + "kl": 0.4130859375, + "learning_rate": 6.863406408094435e-07, + "loss": 0.0004, + "reward": 3.1770557165145874, + "reward_std": 0.14116919820662588, + "rewards/final_reward": 1.7948890625633755, + "rewards/mask_iou_reward": 0.8974445312816878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1770557463169098, + "rewards/thk_ans_format_reward": 1.0, + "step": 930, + "think_completion_length": 55.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.4375, + "epoch": 1.5716694772344013, + "grad_norm": 8.669947318491714, + "kl": 0.42578125, + "learning_rate": 6.860033726812817e-07, + "loss": 0.0004, + "reward": 3.5581700801849365, + "reward_std": 0.10219700261950493, + "rewards/final_reward": 1.5715215638037199, + "rewards/mask_iou_reward": 0.7857607819018599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5581701397895813, + "rewards/thk_ans_format_reward": 1.0, + "step": 931, + "think_completion_length": 64.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.140625, + "epoch": 1.573355817875211, + "grad_norm": 16.21093382898676, + "kl": 0.4111328125, + "learning_rate": 6.856661045531196e-07, + "loss": 0.0004, + "reward": 2.934127926826477, + "reward_std": 0.17660583928227425, + "rewards/final_reward": 0.6131344617154153, + "rewards/mask_iou_reward": 0.30656723085770765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9341279566287994, + "rewards/thk_ans_format_reward": 1.0, + "step": 932, + "think_completion_length": 56.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.671875, + "epoch": 1.5750421585160201, + "grad_norm": 17.569316127594202, + "kl": 0.4677734375, + "learning_rate": 6.853288364249577e-07, + "loss": 0.0005, + "reward": 3.4104976654052734, + "reward_std": 0.17937590926885605, + "rewards/final_reward": 1.079353207141879, + "rewards/mask_iou_reward": 0.5396766035709395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4104976654052734, + "rewards/thk_ans_format_reward": 1.0, + "step": 933, + "think_completion_length": 56.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.46875, + "epoch": 1.5767284991568298, + "grad_norm": 36.84957525298316, + "kl": 0.4111328125, + "learning_rate": 6.849915682967959e-07, + "loss": 0.0004, + "reward": 3.0988067388534546, + "reward_std": 0.084061773493886, + "rewards/final_reward": 1.296683767640328, + "rewards/mask_iou_reward": 0.648341883820164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0988066494464874, + "rewards/thk_ans_format_reward": 1.0, + "step": 934, + "think_completion_length": 54.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.15625, + "epoch": 1.578414839797639, + "grad_norm": 7.168885477010204, + "kl": 0.41796875, + "learning_rate": 6.84654300168634e-07, + "loss": 0.0004, + "reward": 2.9668532609939575, + "reward_std": 0.15460747107863426, + "rewards/final_reward": 0.9098481944969506, + "rewards/mask_iou_reward": 0.4549240972484753, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9668533802032471, + "rewards/thk_ans_format_reward": 1.0, + "step": 935, + "think_completion_length": 54.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.3125, + "epoch": 1.5801011804384486, + "grad_norm": 10.38616655853377, + "kl": 0.41015625, + "learning_rate": 6.843170320404722e-07, + "loss": 0.0004, + "reward": 3.1787116527557373, + "reward_std": 0.22786857932806015, + "rewards/final_reward": 1.3526219058215783, + "rewards/mask_iou_reward": 0.6763109529107891, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.178711622953415, + "rewards/thk_ans_format_reward": 1.0, + "step": 936, + "think_completion_length": 58.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.3125, + "epoch": 1.581787521079258, + "grad_norm": 15.467819146162833, + "kl": 0.462890625, + "learning_rate": 6.839797639123103e-07, + "loss": 0.0005, + "reward": 2.5192718505859375, + "reward_std": 0.08620522171258926, + "rewards/final_reward": 0.8012095167810038, + "rewards/mask_iou_reward": 0.4006047583905019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5192718356847763, + "rewards/thk_ans_format_reward": 1.0, + "step": 937, + "think_completion_length": 56.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.703125, + "epoch": 1.5834738617200674, + "grad_norm": 6.545789979099022, + "kl": 0.51953125, + "learning_rate": 6.836424957841484e-07, + "loss": 0.0005, + "reward": 3.624456763267517, + "reward_std": 0.041466801427304745, + "rewards/final_reward": 1.7924130592935685, + "rewards/mask_iou_reward": 0.8962065296467843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6244568228721619, + "rewards/thk_ans_format_reward": 1.0, + "step": 938, + "think_completion_length": 50.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.296875, + "epoch": 1.5851602023608768, + "grad_norm": 4.387297016647714, + "kl": 0.421875, + "learning_rate": 6.833052276559866e-07, + "loss": 0.0004, + "reward": 3.1855965852737427, + "reward_std": 0.06531479209661484, + "rewards/final_reward": 1.4869811079802981, + "rewards/mask_iou_reward": 0.7434905539901491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.185596525669098, + "rewards/thk_ans_format_reward": 1.0, + "step": 939, + "think_completion_length": 52.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.671875, + "epoch": 1.5868465430016863, + "grad_norm": 9.437264321195768, + "kl": 0.40625, + "learning_rate": 6.829679595278247e-07, + "loss": 0.0004, + "reward": 3.1889456510543823, + "reward_std": 0.3177672028541565, + "rewards/final_reward": 1.5420257945976152, + "rewards/mask_iou_reward": 0.7710128972988076, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2045705914497375, + "rewards/thk_ans_format_reward": 1.0, + "step": 940, + "think_completion_length": 54.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.640625, + "epoch": 1.588532883642496, + "grad_norm": 6.655029831675907, + "kl": 0.439453125, + "learning_rate": 6.826306913996626e-07, + "loss": 0.0004, + "reward": 2.6530632972717285, + "reward_std": 0.3725260943174362, + "rewards/final_reward": 0.7438548296706913, + "rewards/mask_iou_reward": 0.37192741483534564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6530634164810181, + "rewards/thk_ans_format_reward": 1.0, + "step": 941, + "think_completion_length": 59.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.421875, + "epoch": 1.590219224283305, + "grad_norm": 22.97720761020874, + "kl": 0.4091796875, + "learning_rate": 6.822934232715008e-07, + "loss": 0.0004, + "reward": 2.9522364139556885, + "reward_std": 0.23722931742668152, + "rewards/final_reward": 0.7601074556303956, + "rewards/mask_iou_reward": 0.3800537278151978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9522364139556885, + "rewards/thk_ans_format_reward": 1.0, + "step": 942, + "think_completion_length": 54.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.078125, + "epoch": 1.5919055649241147, + "grad_norm": 4.557053886602248, + "kl": 0.4296875, + "learning_rate": 6.819561551433389e-07, + "loss": 0.0004, + "reward": 3.2444454431533813, + "reward_std": 0.09333648579195142, + "rewards/final_reward": 0.6803116547975444, + "rewards/mask_iou_reward": 0.3401558273987722, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2444454729557037, + "rewards/thk_ans_format_reward": 1.0, + "step": 943, + "think_completion_length": 53.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.265625, + "epoch": 1.5935919055649241, + "grad_norm": 6.630359254295777, + "kl": 0.4296875, + "learning_rate": 6.81618887015177e-07, + "loss": 0.0004, + "reward": 3.813447594642639, + "reward_std": 0.13165267184376717, + "rewards/final_reward": 1.8897588433100132, + "rewards/mask_iou_reward": 0.9448794216550066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.813447654247284, + "rewards/thk_ans_format_reward": 1.0, + "step": 944, + "think_completion_length": 54.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.65625, + "epoch": 1.5952782462057336, + "grad_norm": 6.296176030816266, + "kl": 0.40625, + "learning_rate": 6.812816188870152e-07, + "loss": 0.0004, + "reward": 3.185824394226074, + "reward_std": 0.1516023352742195, + "rewards/final_reward": 0.5434738576778565, + "rewards/mask_iou_reward": 0.27173692883892825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1858242750167847, + "rewards/thk_ans_format_reward": 1.0, + "step": 945, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.96875, + "epoch": 1.596964586846543, + "grad_norm": 7.362326699822068, + "kl": 0.50390625, + "learning_rate": 6.809443507588533e-07, + "loss": 0.0005, + "reward": 3.2611594200134277, + "reward_std": 0.16684667952358723, + "rewards/final_reward": 0.858870452074543, + "rewards/mask_iou_reward": 0.4294352260372715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2611593306064606, + "rewards/thk_ans_format_reward": 1.0, + "step": 946, + "think_completion_length": 53.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.78125, + "epoch": 1.5986509274873524, + "grad_norm": 10.544323839047761, + "kl": 0.4716796875, + "learning_rate": 6.806070826306914e-07, + "loss": 0.0005, + "reward": 2.8881239891052246, + "reward_std": 0.09691545739769936, + "rewards/final_reward": 0.4005539385653958, + "rewards/mask_iou_reward": 0.2002769692826979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8881239891052246, + "rewards/thk_ans_format_reward": 1.0, + "step": 947, + "think_completion_length": 50.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.265625, + "epoch": 1.600337268128162, + "grad_norm": 6.359182683112394, + "kl": 0.3974609375, + "learning_rate": 6.802698145025296e-07, + "loss": 0.0004, + "reward": 3.300767660140991, + "reward_std": 0.21483464539051056, + "rewards/final_reward": 1.1850023478592198, + "rewards/mask_iou_reward": 0.5925011739296099, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3007675409317017, + "rewards/thk_ans_format_reward": 1.0, + "step": 948, + "think_completion_length": 48.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.21875, + "epoch": 1.6020236087689712, + "grad_norm": 5.49566363170077, + "kl": 0.41796875, + "learning_rate": 6.799325463743675e-07, + "loss": 0.0004, + "reward": 3.170168161392212, + "reward_std": 0.19993770122528076, + "rewards/final_reward": 1.6430170837061473, + "rewards/mask_iou_reward": 0.8215085418530736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.170168161392212, + "rewards/thk_ans_format_reward": 1.0, + "step": 949, + "think_completion_length": 49.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.75, + "epoch": 1.6037099494097808, + "grad_norm": 13.203439988715884, + "kl": 0.505859375, + "learning_rate": 6.795952782462056e-07, + "loss": 0.0005, + "reward": 2.7104690074920654, + "reward_std": 0.38839419186115265, + "rewards/final_reward": 0.6529316778986988, + "rewards/mask_iou_reward": 0.3264658389493494, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.7260940372943878, + "rewards/thk_ans_format_reward": 1.0, + "step": 950, + "think_completion_length": 53.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.09375, + "epoch": 1.6053962900505903, + "grad_norm": 13.01189056246978, + "kl": 0.4462890625, + "learning_rate": 6.792580101180438e-07, + "loss": 0.0004, + "reward": 2.8600813150405884, + "reward_std": 0.17967913672327995, + "rewards/final_reward": 1.1577608656838663, + "rewards/mask_iou_reward": 0.5788804328419331, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8913313150405884, + "rewards/thk_ans_format_reward": 0.984375, + "step": 951, + "think_completion_length": 53.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.03125, + "epoch": 1.6070826306913997, + "grad_norm": 5.747511025787855, + "kl": 0.49609375, + "learning_rate": 6.789207419898819e-07, + "loss": 0.0005, + "reward": 3.1585750579833984, + "reward_std": 0.23563334345817566, + "rewards/final_reward": 0.6743248054644104, + "rewards/mask_iou_reward": 0.3371624027322052, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1585749387741089, + "rewards/thk_ans_format_reward": 1.0, + "step": 952, + "think_completion_length": 53.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.78125, + "epoch": 1.608768971332209, + "grad_norm": 8.223043903697763, + "kl": 0.50390625, + "learning_rate": 6.7858347386172e-07, + "loss": 0.0005, + "reward": 3.6050385236740112, + "reward_std": 0.14627795293927193, + "rewards/final_reward": 1.7853409096681372, + "rewards/mask_iou_reward": 0.8926704548340686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6050386428833008, + "rewards/thk_ans_format_reward": 1.0, + "step": 953, + "think_completion_length": 60.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.234375, + "epoch": 1.6104553119730185, + "grad_norm": 7.414075157118034, + "kl": 0.4140625, + "learning_rate": 6.782462057335582e-07, + "loss": 0.0004, + "reward": 2.9460577964782715, + "reward_std": 0.1564902514219284, + "rewards/final_reward": 0.6832645173258263, + "rewards/mask_iou_reward": 0.34163225866291314, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9460577964782715, + "rewards/thk_ans_format_reward": 1.0, + "step": 954, + "think_completion_length": 63.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 1.6121416526138281, + "grad_norm": 11.447181386646438, + "kl": 0.49609375, + "learning_rate": 6.779089376053963e-07, + "loss": 0.0005, + "reward": 3.4335960149765015, + "reward_std": 0.270451620221138, + "rewards/final_reward": 1.47075722453424, + "rewards/mask_iou_reward": 0.73537861226712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.433595895767212, + "rewards/thk_ans_format_reward": 1.0, + "step": 955, + "think_completion_length": 61.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.921875, + "epoch": 1.6138279932546373, + "grad_norm": 18.76508612601952, + "kl": 0.44921875, + "learning_rate": 6.775716694772344e-07, + "loss": 0.0004, + "reward": 3.172788619995117, + "reward_std": 0.12457035994157195, + "rewards/final_reward": 0.7677162580570118, + "rewards/mask_iou_reward": 0.3838581290285059, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.172788679599762, + "rewards/thk_ans_format_reward": 1.0, + "step": 956, + "think_completion_length": 50.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.9375, + "epoch": 1.615514333895447, + "grad_norm": 7.851729484504201, + "kl": 0.4345703125, + "learning_rate": 6.772344013490725e-07, + "loss": 0.0004, + "reward": 2.9846887588500977, + "reward_std": 0.2687959522008896, + "rewards/final_reward": 0.6911135867581653, + "rewards/mask_iou_reward": 0.3455567933790826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9846886992454529, + "rewards/thk_ans_format_reward": 1.0, + "step": 957, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.25, + "epoch": 1.6172006745362564, + "grad_norm": 5.848554755465288, + "kl": 0.439453125, + "learning_rate": 6.768971332209105e-07, + "loss": 0.0004, + "reward": 3.0769588947296143, + "reward_std": 0.11406697146594524, + "rewards/final_reward": 1.6693445781968697, + "rewards/mask_iou_reward": 0.8346722890984348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0769589841365814, + "rewards/thk_ans_format_reward": 1.0, + "step": 958, + "think_completion_length": 49.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.765625, + "epoch": 1.6188870151770658, + "grad_norm": 4.807242982976459, + "kl": 0.4140625, + "learning_rate": 6.765598650927486e-07, + "loss": 0.0004, + "reward": 3.054129123687744, + "reward_std": 0.19870612863451242, + "rewards/final_reward": 1.0483865036756663, + "rewards/mask_iou_reward": 0.5241932518378332, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0697540044784546, + "rewards/thk_ans_format_reward": 1.0, + "step": 959, + "think_completion_length": 52.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.796875, + "epoch": 1.6205733558178752, + "grad_norm": 4.898922143259208, + "kl": 0.423828125, + "learning_rate": 6.762225969645868e-07, + "loss": 0.0004, + "reward": 3.6588209867477417, + "reward_std": 0.04261109419167042, + "rewards/final_reward": 1.5577609229973175, + "rewards/mask_iou_reward": 0.7788804614986587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6588210463523865, + "rewards/thk_ans_format_reward": 1.0, + "step": 960, + "think_completion_length": 50.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.625, + "epoch": 1.6222596964586846, + "grad_norm": 14.867085837269215, + "kl": 0.474609375, + "learning_rate": 6.758853288364249e-07, + "loss": 0.0005, + "reward": 3.124504804611206, + "reward_std": 0.12859491258859634, + "rewards/final_reward": 0.8346335177809561, + "rewards/mask_iou_reward": 0.41731675889047803, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1245048642158508, + "rewards/thk_ans_format_reward": 1.0, + "step": 961, + "think_completion_length": 52.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.265625, + "epoch": 1.6239460370994943, + "grad_norm": 5.2997309744790035, + "kl": 0.4267578125, + "learning_rate": 6.755480607082631e-07, + "loss": 0.0004, + "reward": 3.4570316076278687, + "reward_std": 0.05912900622934103, + "rewards/final_reward": 1.0982445093578157, + "rewards/mask_iou_reward": 0.5491222546789079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.457031488418579, + "rewards/thk_ans_format_reward": 1.0, + "step": 962, + "think_completion_length": 54.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.171875, + "epoch": 1.6256323777403034, + "grad_norm": 5.236334173922632, + "kl": 0.4912109375, + "learning_rate": 6.752107925801012e-07, + "loss": 0.0003, + "reward": 3.1656047105789185, + "reward_std": 0.09970117919147015, + "rewards/final_reward": 0.9881815716090991, + "rewards/mask_iou_reward": 0.49409078580454957, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1656047105789185, + "rewards/thk_ans_format_reward": 1.0, + "step": 963, + "think_completion_length": 53.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.328125, + "epoch": 1.627318718381113, + "grad_norm": 8.80017290037366, + "kl": 0.4306640625, + "learning_rate": 6.748735244519393e-07, + "loss": 0.0004, + "reward": 3.161680221557617, + "reward_std": 0.1751057505607605, + "rewards/final_reward": 1.6679947373879087, + "rewards/mask_iou_reward": 0.8339973686939544, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1773052215576172, + "rewards/thk_ans_format_reward": 1.0, + "step": 964, + "think_completion_length": 57.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.21875, + "epoch": 1.6290050590219223, + "grad_norm": 9.93252740921444, + "kl": 0.458984375, + "learning_rate": 6.745362563237775e-07, + "loss": 0.0005, + "reward": 3.409914016723633, + "reward_std": 0.31119733303785324, + "rewards/final_reward": 1.532120199761723, + "rewards/mask_iou_reward": 0.7660600998808615, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4255390763282776, + "rewards/thk_ans_format_reward": 1.0, + "step": 965, + "think_completion_length": 52.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.234375, + "epoch": 1.630691399662732, + "grad_norm": 16.77150015777779, + "kl": 0.55859375, + "learning_rate": 6.741989881956155e-07, + "loss": 0.0006, + "reward": 3.312274217605591, + "reward_std": 0.14685458689928055, + "rewards/final_reward": 1.7361032842907864, + "rewards/mask_iou_reward": 0.8680516421453932, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3122743964195251, + "rewards/thk_ans_format_reward": 1.0, + "step": 966, + "think_completion_length": 53.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.09375, + "epoch": 1.6323777403035413, + "grad_norm": 26.952546856783577, + "kl": 0.443359375, + "learning_rate": 6.738617200674535e-07, + "loss": 0.0004, + "reward": 3.182400941848755, + "reward_std": 0.2391754314303398, + "rewards/final_reward": 1.218758984763645, + "rewards/mask_iou_reward": 0.6093794923818225, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1824010014533997, + "rewards/thk_ans_format_reward": 1.0, + "step": 967, + "think_completion_length": 57.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.859375, + "epoch": 1.6340640809443507, + "grad_norm": 10.164153327570757, + "kl": 0.45703125, + "learning_rate": 6.735244519392917e-07, + "loss": 0.0005, + "reward": 3.2615323066711426, + "reward_std": 0.22615301050245762, + "rewards/final_reward": 1.009834806795975, + "rewards/mask_iou_reward": 0.5049174033979875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2615323066711426, + "rewards/thk_ans_format_reward": 1.0, + "step": 968, + "think_completion_length": 62.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.25, + "epoch": 1.6357504215851602, + "grad_norm": 5.7572682275727995, + "kl": 0.46875, + "learning_rate": 6.731871838111298e-07, + "loss": 0.0005, + "reward": 3.587040066719055, + "reward_std": 0.24026421457529068, + "rewards/final_reward": 1.3034248229880045, + "rewards/mask_iou_reward": 0.6517124114940023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5870400071144104, + "rewards/thk_ans_format_reward": 1.0, + "step": 969, + "think_completion_length": 52.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.4375, + "epoch": 1.6374367622259696, + "grad_norm": 8.103092577365828, + "kl": 0.470703125, + "learning_rate": 6.728499156829679e-07, + "loss": 0.0005, + "reward": 2.8985953330993652, + "reward_std": 0.3215479403734207, + "rewards/final_reward": 0.6277226580931698, + "rewards/mask_iou_reward": 0.3138613290465849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8985952436923981, + "rewards/thk_ans_format_reward": 1.0, + "step": 970, + "think_completion_length": 56.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.84375, + "epoch": 1.6391231028667792, + "grad_norm": 7.764069334142484, + "kl": 0.537109375, + "learning_rate": 6.725126475548061e-07, + "loss": 0.0005, + "reward": 3.494862914085388, + "reward_std": 0.044202063232660294, + "rewards/final_reward": 1.3648452716227641, + "rewards/mask_iou_reward": 0.6824226358113821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4948627352714539, + "rewards/thk_ans_format_reward": 1.0, + "step": 971, + "think_completion_length": 50.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.75, + "epoch": 1.6408094435075884, + "grad_norm": 7.854660442406088, + "kl": 0.525390625, + "learning_rate": 6.721753794266442e-07, + "loss": 0.0005, + "reward": 2.56212317943573, + "reward_std": 0.1159486398100853, + "rewards/final_reward": 1.0332222703548855, + "rewards/mask_iou_reward": 0.5166111351774427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5621231943368912, + "rewards/thk_ans_format_reward": 1.0, + "step": 972, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.984375, + "epoch": 1.642495784148398, + "grad_norm": 5.779886052799433, + "kl": 0.4365234375, + "learning_rate": 6.718381112984823e-07, + "loss": 0.0004, + "reward": 3.4496811628341675, + "reward_std": 0.07857851311564445, + "rewards/final_reward": 1.3060323556255151, + "rewards/mask_iou_reward": 0.6530161778127576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4496811628341675, + "rewards/thk_ans_format_reward": 1.0, + "step": 973, + "think_completion_length": 52.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.0, + "epoch": 1.6441821247892074, + "grad_norm": 197.52017044670364, + "kl": 0.4443359375, + "learning_rate": 6.715008431703204e-07, + "loss": 0.0004, + "reward": 3.357508063316345, + "reward_std": 0.13863565400242805, + "rewards/final_reward": 1.5934704967616962, + "rewards/mask_iou_reward": 0.7967352483808481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3575079441070557, + "rewards/thk_ans_format_reward": 1.0, + "step": 974, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.296875, + "epoch": 1.6458684654300169, + "grad_norm": 13.268391687877173, + "kl": 0.552734375, + "learning_rate": 6.711635750421585e-07, + "loss": 0.0006, + "reward": 3.3350095748901367, + "reward_std": 0.19416646659374237, + "rewards/final_reward": 1.1799294280478632, + "rewards/mask_iou_reward": 0.5899647140239316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.335009515285492, + "rewards/thk_ans_format_reward": 1.0, + "step": 975, + "think_completion_length": 60.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.671875, + "epoch": 1.6475548060708263, + "grad_norm": 5.910386625262815, + "kl": 0.43359375, + "learning_rate": 6.708263069139965e-07, + "loss": 0.0004, + "reward": 3.4555543661117554, + "reward_std": 0.04451208934187889, + "rewards/final_reward": 1.316332445803982, + "rewards/mask_iou_reward": 0.658166222901991, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4555545449256897, + "rewards/thk_ans_format_reward": 1.0, + "step": 976, + "think_completion_length": 53.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.015625, + "epoch": 1.6492411467116357, + "grad_norm": 5.538920499109146, + "kl": 0.46875, + "learning_rate": 6.704890387858347e-07, + "loss": 0.0005, + "reward": 2.931230068206787, + "reward_std": 0.12052519991993904, + "rewards/final_reward": 0.8966643714602006, + "rewards/mask_iou_reward": 0.4483321857301003, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9468550682067871, + "rewards/thk_ans_format_reward": 1.0, + "step": 977, + "think_completion_length": 60.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.921875, + "epoch": 1.6509274873524453, + "grad_norm": 7.054514546876495, + "kl": 0.470703125, + "learning_rate": 6.701517706576728e-07, + "loss": 0.0005, + "reward": 3.2970367670059204, + "reward_std": 0.3092042412608862, + "rewards/final_reward": 1.645884152817545, + "rewards/mask_iou_reward": 0.8229420764087725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2970367670059204, + "rewards/thk_ans_format_reward": 1.0, + "step": 978, + "think_completion_length": 54.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.546875, + "epoch": 1.6526138279932545, + "grad_norm": 8.006301481962957, + "kl": 0.4638671875, + "learning_rate": 6.698145025295109e-07, + "loss": 0.0005, + "reward": 3.2908164262771606, + "reward_std": 0.1687500774860382, + "rewards/final_reward": 0.8286613290487709, + "rewards/mask_iou_reward": 0.41433066452438544, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2908163666725159, + "rewards/thk_ans_format_reward": 1.0, + "step": 979, + "think_completion_length": 51.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.953125, + "epoch": 1.6543001686340641, + "grad_norm": 3.8420590887205512, + "kl": 0.458984375, + "learning_rate": 6.694772344013491e-07, + "loss": 0.0005, + "reward": 3.4131717681884766, + "reward_std": 0.06533291470259428, + "rewards/final_reward": 1.1531764423339341, + "rewards/mask_iou_reward": 0.5765882211669671, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4131719470024109, + "rewards/thk_ans_format_reward": 1.0, + "step": 980, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.53125, + "epoch": 1.6559865092748736, + "grad_norm": 5.036403987186729, + "kl": 0.48828125, + "learning_rate": 6.691399662731872e-07, + "loss": 0.0005, + "reward": 3.548070192337036, + "reward_std": 0.2200283706188202, + "rewards/final_reward": 1.390233837634983, + "rewards/mask_iou_reward": 0.6951169188174915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5480701923370361, + "rewards/thk_ans_format_reward": 1.0, + "step": 981, + "think_completion_length": 54.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.109375, + "epoch": 1.657672849915683, + "grad_norm": 7.029428131198568, + "kl": 0.412109375, + "learning_rate": 6.688026981450252e-07, + "loss": 0.0004, + "reward": 3.329420566558838, + "reward_std": 0.08415714651346207, + "rewards/final_reward": 1.756476899069994, + "rewards/mask_iou_reward": 0.878238449534997, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3294204473495483, + "rewards/thk_ans_format_reward": 1.0, + "step": 982, + "think_completion_length": 50.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.125, + "epoch": 1.6593591905564924, + "grad_norm": 8.335193513559757, + "kl": 0.5380859375, + "learning_rate": 6.684654300168634e-07, + "loss": 0.0005, + "reward": 3.044628858566284, + "reward_std": 0.12631105724722147, + "rewards/final_reward": 1.3542042297420498, + "rewards/mask_iou_reward": 0.6771021148710249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0446289479732513, + "rewards/thk_ans_format_reward": 1.0, + "step": 983, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.734375, + "epoch": 1.6610455311973018, + "grad_norm": 5.917140641040661, + "kl": 0.4580078125, + "learning_rate": 6.681281618887014e-07, + "loss": 0.0005, + "reward": 3.469908595085144, + "reward_std": 0.2401201520115137, + "rewards/final_reward": 1.538840379611916, + "rewards/mask_iou_reward": 0.769420189805958, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.485533595085144, + "rewards/thk_ans_format_reward": 1.0, + "step": 984, + "think_completion_length": 55.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.421875, + "epoch": 1.6627318718381114, + "grad_norm": 12.243138457321537, + "kl": 0.455078125, + "learning_rate": 6.677908937605396e-07, + "loss": 0.0005, + "reward": 2.940201163291931, + "reward_std": 0.31569964066147804, + "rewards/final_reward": 0.9419295381271846, + "rewards/mask_iou_reward": 0.4709647690635923, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.9558261930942535, + "rewards/thk_ans_format_reward": 1.0, + "step": 985, + "think_completion_length": 50.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.015625, + "epoch": 1.6644182124789206, + "grad_norm": 5.822222361871982, + "kl": 0.509765625, + "learning_rate": 6.674536256323777e-07, + "loss": 0.0005, + "reward": 3.062288284301758, + "reward_std": 0.13281331211328506, + "rewards/final_reward": 1.1950692639241447, + "rewards/mask_iou_reward": 0.5975346319620723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0622882843017578, + "rewards/thk_ans_format_reward": 1.0, + "step": 986, + "think_completion_length": 56.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.109375, + "epoch": 1.6661045531197303, + "grad_norm": 6.756491763935447, + "kl": 0.474609375, + "learning_rate": 6.671163575042158e-07, + "loss": 0.0005, + "reward": 2.8552377223968506, + "reward_std": 0.24670583754777908, + "rewards/final_reward": 0.6170252546302251, + "rewards/mask_iou_reward": 0.30851262731511253, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8708627223968506, + "rewards/thk_ans_format_reward": 1.0, + "step": 987, + "think_completion_length": 52.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.578125, + "epoch": 1.6677908937605397, + "grad_norm": 6.603201839448816, + "kl": 0.443359375, + "learning_rate": 6.66779089376054e-07, + "loss": 0.0004, + "reward": 3.013177275657654, + "reward_std": 0.22320347279310226, + "rewards/final_reward": 0.7690452230863198, + "rewards/mask_iou_reward": 0.3845226115431599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0131773054599762, + "rewards/thk_ans_format_reward": 1.0, + "step": 988, + "think_completion_length": 50.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.125, + "epoch": 1.669477234401349, + "grad_norm": 6.814146979881974, + "kl": 0.52734375, + "learning_rate": 6.664418212478921e-07, + "loss": 0.0005, + "reward": 3.1289366483688354, + "reward_std": 0.22629151493310928, + "rewards/final_reward": 1.236931896683771, + "rewards/mask_iou_reward": 0.6184659483418855, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1445615887641907, + "rewards/thk_ans_format_reward": 1.0, + "step": 989, + "think_completion_length": 46.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.390625, + "epoch": 1.6711635750421585, + "grad_norm": 7.369379403584672, + "kl": 0.4375, + "learning_rate": 6.661045531197301e-07, + "loss": 0.0004, + "reward": 3.5929524898529053, + "reward_std": 0.19491755589842796, + "rewards/final_reward": 1.80680461215357, + "rewards/mask_iou_reward": 0.903402306076785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5929523706436157, + "rewards/thk_ans_format_reward": 1.0, + "step": 990, + "think_completion_length": 55.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.140625, + "epoch": 1.672849915682968, + "grad_norm": 8.968397003633898, + "kl": 0.458984375, + "learning_rate": 6.657672849915683e-07, + "loss": 0.0005, + "reward": 3.1989855766296387, + "reward_std": 0.4630395621061325, + "rewards/final_reward": 0.8420022603292938, + "rewards/mask_iou_reward": 0.4210011301646469, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2146106958389282, + "rewards/thk_ans_format_reward": 1.0, + "step": 991, + "think_completion_length": 56.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.234375, + "epoch": 1.6745362563237776, + "grad_norm": 4.908390198867559, + "kl": 0.462890625, + "learning_rate": 6.654300168634064e-07, + "loss": 0.0005, + "reward": 3.2098724842071533, + "reward_std": 0.11346443742513657, + "rewards/final_reward": 1.7597471616814055, + "rewards/mask_iou_reward": 0.8798735808407028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2098724842071533, + "rewards/thk_ans_format_reward": 1.0, + "step": 992, + "think_completion_length": 44.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.15625, + "epoch": 1.6762225969645868, + "grad_norm": 18.70764305875646, + "kl": 0.52734375, + "learning_rate": 6.650927487352444e-07, + "loss": 0.0005, + "reward": 2.8987958431243896, + "reward_std": 0.09079464711248875, + "rewards/final_reward": 0.8476097891744324, + "rewards/mask_iou_reward": 0.4238048945872162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8987958431243896, + "rewards/thk_ans_format_reward": 1.0, + "step": 993, + "think_completion_length": 54.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.78125, + "epoch": 1.6779089376053964, + "grad_norm": 18.73896888137206, + "kl": 0.513671875, + "learning_rate": 6.647554806070826e-07, + "loss": 0.0005, + "reward": 3.5080639123916626, + "reward_std": 0.06798750162124634, + "rewards/final_reward": 1.4192495858629686, + "rewards/mask_iou_reward": 0.7096247929314843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5080639123916626, + "rewards/thk_ans_format_reward": 1.0, + "step": 994, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.46875, + "epoch": 1.6795952782462056, + "grad_norm": 12.262342821384445, + "kl": 0.44140625, + "learning_rate": 6.644182124789207e-07, + "loss": 0.0004, + "reward": 3.409273624420166, + "reward_std": 0.3849910721182823, + "rewards/final_reward": 1.5313665169886965, + "rewards/mask_iou_reward": 0.7656832584943483, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.4717735648155212, + "rewards/thk_ans_format_reward": 0.96875, + "step": 995, + "think_completion_length": 47.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.359375, + "epoch": 1.6812816188870152, + "grad_norm": 8.38298900781314, + "kl": 0.4580078125, + "learning_rate": 6.640809443507588e-07, + "loss": 0.0005, + "reward": 3.8332111835479736, + "reward_std": 0.04581199027597904, + "rewards/final_reward": 1.7347238305560204, + "rewards/mask_iou_reward": 0.8673619152780102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8332111835479736, + "rewards/thk_ans_format_reward": 1.0, + "step": 996, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.890625, + "epoch": 1.6829679595278246, + "grad_norm": 46.42672517479715, + "kl": 1.2060546875, + "learning_rate": 6.63743676222597e-07, + "loss": 0.0012, + "reward": 2.7516547441482544, + "reward_std": 0.07285407930612564, + "rewards/final_reward": 0.6759741659116778, + "rewards/mask_iou_reward": 0.3379870829558389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7516548335552216, + "rewards/thk_ans_format_reward": 1.0, + "step": 997, + "think_completion_length": 50.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 1.684654300168634, + "grad_norm": 4.497527571240939, + "kl": 0.4501953125, + "learning_rate": 6.63406408094435e-07, + "loss": 0.0005, + "reward": 3.022408366203308, + "reward_std": 0.2865653783082962, + "rewards/final_reward": 1.0426820398855086, + "rewards/mask_iou_reward": 0.5213410199427543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0224083065986633, + "rewards/thk_ans_format_reward": 1.0, + "step": 998, + "think_completion_length": 57.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.421875, + "epoch": 1.6863406408094435, + "grad_norm": 15.870394792828607, + "kl": 0.5126953125, + "learning_rate": 6.630691399662731e-07, + "loss": 0.0005, + "reward": 3.2935569286346436, + "reward_std": 0.2325892373919487, + "rewards/final_reward": 1.3165577341999402, + "rewards/mask_iou_reward": 0.6582788670999701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2935569882392883, + "rewards/thk_ans_format_reward": 1.0, + "step": 999, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.90625, + "epoch": 1.6880269814502529, + "grad_norm": 7.690590543519975, + "kl": 0.5068359375, + "learning_rate": 6.627318718381113e-07, + "loss": 0.0005, + "reward": 3.773517370223999, + "reward_std": 0.15830123564228415, + "rewards/final_reward": 1.6504267851557586, + "rewards/mask_iou_reward": 0.8252133925778793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.773517370223999, + "rewards/thk_ans_format_reward": 1.0, + "step": 1000, + "think_completion_length": 52.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.90625, + "epoch": 1.6897133220910625, + "grad_norm": 21.874063040209965, + "kl": 0.603515625, + "learning_rate": 6.623946037099494e-07, + "loss": 0.0006, + "reward": 3.3424084186553955, + "reward_std": 0.10551745275733992, + "rewards/final_reward": 1.1636657158449948, + "rewards/mask_iou_reward": 0.5818328579224974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3424084186553955, + "rewards/thk_ans_format_reward": 1.0, + "step": 1001, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.484375, + "epoch": 1.6913996627318717, + "grad_norm": 9.269280540478269, + "kl": 0.4208984375, + "learning_rate": 6.620573355817874e-07, + "loss": 0.0004, + "reward": 3.7046056985855103, + "reward_std": 0.07972065731883049, + "rewards/final_reward": 1.7708265223477202, + "rewards/mask_iou_reward": 0.8854132611738601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7046056985855103, + "rewards/thk_ans_format_reward": 1.0, + "step": 1002, + "think_completion_length": 59.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.765625, + "epoch": 1.6930860033726813, + "grad_norm": 9.109123179163847, + "kl": 0.494140625, + "learning_rate": 6.617200674536256e-07, + "loss": 0.0005, + "reward": 2.643565535545349, + "reward_std": 0.16619166731834412, + "rewards/final_reward": 0.28977825331355156, + "rewards/mask_iou_reward": 0.14488912665677578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6435655355453491, + "rewards/thk_ans_format_reward": 1.0, + "step": 1003, + "think_completion_length": 51.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0, + "epoch": 1.6947723440134908, + "grad_norm": 15.214547323566004, + "kl": 0.498046875, + "learning_rate": 6.613827993254637e-07, + "loss": 0.0005, + "reward": 3.6040256023406982, + "reward_std": 0.1329221185296774, + "rewards/final_reward": 1.6264060886231384, + "rewards/mask_iou_reward": 0.8132030443115692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6040256023406982, + "rewards/thk_ans_format_reward": 1.0, + "step": 1004, + "think_completion_length": 51.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.796875, + "epoch": 1.6964586846543002, + "grad_norm": 5.992950266206152, + "kl": 0.455078125, + "learning_rate": 6.610455311973018e-07, + "loss": 0.0005, + "reward": 3.3491801023483276, + "reward_std": 0.16978841368108988, + "rewards/final_reward": 1.5574006048073583, + "rewards/mask_iou_reward": 0.7787003024036792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3491801619529724, + "rewards/thk_ans_format_reward": 1.0, + "step": 1005, + "think_completion_length": 53.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.71875, + "epoch": 1.6981450252951096, + "grad_norm": 12.419592591697839, + "kl": 0.4609375, + "learning_rate": 6.6070826306914e-07, + "loss": 0.0005, + "reward": 3.398915648460388, + "reward_std": 0.23545659333467484, + "rewards/final_reward": 1.6019027950726148, + "rewards/mask_iou_reward": 0.8009513975363074, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3989156484603882, + "rewards/thk_ans_format_reward": 1.0, + "step": 1006, + "think_completion_length": 59.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.109375, + "epoch": 1.699831365935919, + "grad_norm": 3.8120154688685104, + "kl": 0.41796875, + "learning_rate": 6.60370994940978e-07, + "loss": 0.0004, + "reward": 2.801236391067505, + "reward_std": 0.23255718499422073, + "rewards/final_reward": 0.7371322477079272, + "rewards/mask_iou_reward": 0.3685661238539636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8012365102767944, + "rewards/thk_ans_format_reward": 1.0, + "step": 1007, + "think_completion_length": 48.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.078125, + "epoch": 1.7015177065767286, + "grad_norm": 7.748580989294214, + "kl": 0.470703125, + "learning_rate": 6.600337268128161e-07, + "loss": 0.0005, + "reward": 3.6431901454925537, + "reward_std": 0.045935716829262674, + "rewards/final_reward": 1.5224881620611639, + "rewards/mask_iou_reward": 0.7612440810305819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6431901454925537, + "rewards/thk_ans_format_reward": 1.0, + "step": 1008, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.171875, + "epoch": 1.7032040472175378, + "grad_norm": 5.737195692191415, + "kl": 0.494140625, + "learning_rate": 6.596964586846543e-07, + "loss": 0.0005, + "reward": 3.3539552688598633, + "reward_std": 0.16605842299759388, + "rewards/final_reward": 1.6803083781412165, + "rewards/mask_iou_reward": 0.8401541890706082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.353955328464508, + "rewards/thk_ans_format_reward": 1.0, + "step": 1009, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.171875, + "epoch": 1.7048903878583475, + "grad_norm": 7.962368314621498, + "kl": 0.548828125, + "learning_rate": 6.593591905564924e-07, + "loss": 0.0005, + "reward": 2.8921409845352173, + "reward_std": 0.24707718193531036, + "rewards/final_reward": 0.5459787002187156, + "rewards/mask_iou_reward": 0.2729893501093578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8921409249305725, + "rewards/thk_ans_format_reward": 1.0, + "step": 1010, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.484375, + "epoch": 1.7065767284991569, + "grad_norm": 9.146835254517585, + "kl": 0.486328125, + "learning_rate": 6.590219224283306e-07, + "loss": 0.0005, + "reward": 3.0934470891952515, + "reward_std": 0.16183524578809738, + "rewards/final_reward": 0.7069903265430519, + "rewards/mask_iou_reward": 0.35349516327152597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0934468805789948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1011, + "think_completion_length": 53.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.203125, + "epoch": 1.7082630691399663, + "grad_norm": 13.175316522972508, + "kl": 0.4267578125, + "learning_rate": 6.586846543001686e-07, + "loss": 0.0005, + "reward": 3.282162666320801, + "reward_std": 0.18916182965040207, + "rewards/final_reward": 1.0668367459031367, + "rewards/mask_iou_reward": 0.5334183729515684, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2821626663208008, + "rewards/thk_ans_format_reward": 1.0, + "step": 1012, + "think_completion_length": 53.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.328125, + "epoch": 1.7099494097807757, + "grad_norm": 8.659680807686604, + "kl": 0.45703125, + "learning_rate": 6.583473861720067e-07, + "loss": 0.0005, + "reward": 3.1156177520751953, + "reward_std": 0.08516193181276321, + "rewards/final_reward": 0.5883904742783279, + "rewards/mask_iou_reward": 0.29419523713916396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.11561781167984, + "rewards/thk_ans_format_reward": 1.0, + "step": 1013, + "think_completion_length": 52.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.734375, + "epoch": 1.7116357504215851, + "grad_norm": 18.041790037299258, + "kl": 0.556640625, + "learning_rate": 6.580101180438449e-07, + "loss": 0.0006, + "reward": 3.402442216873169, + "reward_std": 0.08341848477721214, + "rewards/final_reward": 1.1143318288277488, + "rewards/mask_iou_reward": 0.5571659144138744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.402442216873169, + "rewards/thk_ans_format_reward": 1.0, + "step": 1014, + "think_completion_length": 51.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.46875, + "epoch": 1.7133220910623947, + "grad_norm": 11.7012394200138, + "kl": 0.49609375, + "learning_rate": 6.576728499156829e-07, + "loss": 0.0005, + "reward": 2.6576205492019653, + "reward_std": 0.22013526409864426, + "rewards/final_reward": 0.7089271269158894, + "rewards/mask_iou_reward": 0.3544635634579447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6576206088066101, + "rewards/thk_ans_format_reward": 1.0, + "step": 1015, + "think_completion_length": 53.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.46875, + "epoch": 1.715008431703204, + "grad_norm": 4.359995063594274, + "kl": 0.4912109375, + "learning_rate": 6.57335581787521e-07, + "loss": 0.0005, + "reward": 3.22420072555542, + "reward_std": 0.28856465220451355, + "rewards/final_reward": 1.3810333083133202, + "rewards/mask_iou_reward": 0.6905166541566601, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.239825963973999, + "rewards/thk_ans_format_reward": 1.0, + "step": 1016, + "think_completion_length": 59.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.515625, + "epoch": 1.7166947723440136, + "grad_norm": 4.3461415710926, + "kl": 0.43359375, + "learning_rate": 6.569983136593592e-07, + "loss": 0.0004, + "reward": 3.1074719429016113, + "reward_std": 0.32912111282348633, + "rewards/final_reward": 1.0721299991151474, + "rewards/mask_iou_reward": 0.5360649995575737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1074718832969666, + "rewards/thk_ans_format_reward": 1.0, + "step": 1017, + "think_completion_length": 51.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.421875, + "epoch": 1.718381112984823, + "grad_norm": 21.260604756308766, + "kl": 0.357421875, + "learning_rate": 6.566610455311973e-07, + "loss": 0.0004, + "reward": 2.964912176132202, + "reward_std": 0.21077851206064224, + "rewards/final_reward": 1.1033498150252896, + "rewards/mask_iou_reward": 0.5516749075126448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9649122655391693, + "rewards/thk_ans_format_reward": 1.0, + "step": 1018, + "think_completion_length": 59.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.8125, + "epoch": 1.7200674536256324, + "grad_norm": 7.706174182028309, + "kl": 0.45703125, + "learning_rate": 6.563237774030354e-07, + "loss": 0.0005, + "reward": 2.96855366230011, + "reward_std": 0.18945128098130226, + "rewards/final_reward": 0.8479449831806979, + "rewards/mask_iou_reward": 0.42397249159034894, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9685536324977875, + "rewards/thk_ans_format_reward": 1.0, + "step": 1019, + "think_completion_length": 47.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.46875, + "epoch": 1.7217537942664418, + "grad_norm": 9.21032483206, + "kl": 0.470703125, + "learning_rate": 6.559865092748735e-07, + "loss": 0.0005, + "reward": 3.0714797973632812, + "reward_std": 0.18478820845484734, + "rewards/final_reward": 0.9969208253996952, + "rewards/mask_iou_reward": 0.4984604126998476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.071479707956314, + "rewards/thk_ans_format_reward": 1.0, + "step": 1020, + "think_completion_length": 55.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.796875, + "epoch": 1.7234401349072512, + "grad_norm": 11.380041316059248, + "kl": 0.4501953125, + "learning_rate": 6.556492411467116e-07, + "loss": 0.0004, + "reward": 3.2528512477874756, + "reward_std": 0.24282580788712949, + "rewards/final_reward": 0.6978428253976936, + "rewards/mask_iou_reward": 0.3489214126988468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2528512477874756, + "rewards/thk_ans_format_reward": 1.0, + "step": 1021, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.578125, + "epoch": 1.7251264755480609, + "grad_norm": 4.529523135510817, + "kl": 0.462890625, + "learning_rate": 6.553119730185497e-07, + "loss": 0.0005, + "reward": 3.537788510322571, + "reward_std": 0.17213429510593414, + "rewards/final_reward": 1.4058552515127203, + "rewards/mask_iou_reward": 0.7029276257563601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.537788450717926, + "rewards/thk_ans_format_reward": 1.0, + "step": 1022, + "think_completion_length": 48.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.953125, + "epoch": 1.72681281618887, + "grad_norm": 7.594129251397134, + "kl": 0.552734375, + "learning_rate": 6.549747048903878e-07, + "loss": 0.0006, + "reward": 3.0862042903900146, + "reward_std": 0.09378309547901154, + "rewards/final_reward": 0.9123979252746075, + "rewards/mask_iou_reward": 0.45619896263730375, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0862043499946594, + "rewards/thk_ans_format_reward": 1.0, + "step": 1023, + "think_completion_length": 51.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.890625, + "epoch": 1.7284991568296797, + "grad_norm": 15.814763062458056, + "kl": 0.4609375, + "learning_rate": 6.546374367622259e-07, + "loss": 0.0005, + "reward": 3.47658109664917, + "reward_std": 0.09444395080208778, + "rewards/final_reward": 1.546256486336178, + "rewards/mask_iou_reward": 0.773128243168089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4765812158584595, + "rewards/thk_ans_format_reward": 1.0, + "step": 1024, + "think_completion_length": 55.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.0625, + "epoch": 1.7301854974704889, + "grad_norm": 23.226417626108393, + "kl": 0.4951171875, + "learning_rate": 6.54300168634064e-07, + "loss": 0.0005, + "reward": 3.4563424587249756, + "reward_std": 0.15790753066539764, + "rewards/final_reward": 1.3368036721575525, + "rewards/mask_iou_reward": 0.6684018360787762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4563424587249756, + "rewards/thk_ans_format_reward": 1.0, + "step": 1025, + "think_completion_length": 53.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.140625, + "epoch": 1.7318718381112985, + "grad_norm": 4.910500372680354, + "kl": 0.478515625, + "learning_rate": 6.539629005059022e-07, + "loss": 0.0005, + "reward": 3.1884742975234985, + "reward_std": 0.25331611186265945, + "rewards/final_reward": 1.78840388322968, + "rewards/mask_iou_reward": 0.89420194161484, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.204099178314209, + "rewards/thk_ans_format_reward": 1.0, + "step": 1026, + "think_completion_length": 51.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.390625, + "epoch": 1.733558178752108, + "grad_norm": 8.4172557883391, + "kl": 0.4970703125, + "learning_rate": 6.536256323777403e-07, + "loss": 0.0005, + "reward": 3.1042816638946533, + "reward_std": 0.08971688710153103, + "rewards/final_reward": 1.287853213224611, + "rewards/mask_iou_reward": 0.6439266066123055, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.104281485080719, + "rewards/thk_ans_format_reward": 1.0, + "step": 1027, + "think_completion_length": 51.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.875, + "epoch": 1.7352445193929174, + "grad_norm": 12.583209258616298, + "kl": 0.4970703125, + "learning_rate": 6.532883642495784e-07, + "loss": 0.0005, + "reward": 3.4771846532821655, + "reward_std": 0.13392280414700508, + "rewards/final_reward": 1.3426299767605006, + "rewards/mask_iou_reward": 0.6713149883802503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4771845936775208, + "rewards/thk_ans_format_reward": 1.0, + "step": 1028, + "think_completion_length": 49.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8125, + "epoch": 1.7369308600337268, + "grad_norm": 5.460307618750845, + "kl": 0.4345703125, + "learning_rate": 6.529510961214165e-07, + "loss": 0.0004, + "reward": 2.8394622802734375, + "reward_std": 0.2670469731092453, + "rewards/final_reward": 0.8895338814851627, + "rewards/mask_iou_reward": 0.44476694074258133, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8550873398780823, + "rewards/thk_ans_format_reward": 1.0, + "step": 1029, + "think_completion_length": 48.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.109375, + "epoch": 1.7386172006745362, + "grad_norm": 4.352846224632927, + "kl": 0.4345703125, + "learning_rate": 6.526138279932546e-07, + "loss": 0.0004, + "reward": 3.856082320213318, + "reward_std": 0.011286142049357295, + "rewards/final_reward": 1.8986783874918174, + "rewards/mask_iou_reward": 0.9493391937459087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8560824394226074, + "rewards/thk_ans_format_reward": 1.0, + "step": 1030, + "think_completion_length": 50.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 1.7403035413153458, + "grad_norm": 9.118879145324778, + "kl": 0.462890625, + "learning_rate": 6.522765598650926e-07, + "loss": 0.0005, + "reward": 3.309417724609375, + "reward_std": 0.452437125146389, + "rewards/final_reward": 1.1533700789648118, + "rewards/mask_iou_reward": 0.5766850394824059, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3094177842140198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1031, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.671875, + "epoch": 1.741989881956155, + "grad_norm": 9.320656417334392, + "kl": 0.4638671875, + "learning_rate": 6.519392917369308e-07, + "loss": 0.0005, + "reward": 3.5183212757110596, + "reward_std": 0.22874368727207184, + "rewards/final_reward": 1.4453660173187608, + "rewards/mask_iou_reward": 0.7226830086593804, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5183210968971252, + "rewards/thk_ans_format_reward": 1.0, + "step": 1032, + "think_completion_length": 57.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.234375, + "epoch": 1.7436762225969646, + "grad_norm": 6.520407909354839, + "kl": 0.404296875, + "learning_rate": 6.516020236087689e-07, + "loss": 0.0004, + "reward": 2.8486337661743164, + "reward_std": 0.2386086881160736, + "rewards/final_reward": 0.8908258070687297, + "rewards/mask_iou_reward": 0.44541290353436486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.848633736371994, + "rewards/thk_ans_format_reward": 1.0, + "step": 1033, + "think_completion_length": 47.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.703125, + "epoch": 1.745362563237774, + "grad_norm": 41.13201708403674, + "kl": 0.515625, + "learning_rate": 6.51264755480607e-07, + "loss": 0.0005, + "reward": 3.380680561065674, + "reward_std": 0.17691625840961933, + "rewards/final_reward": 1.2292933076821977, + "rewards/mask_iou_reward": 0.6146466538410988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3806805610656738, + "rewards/thk_ans_format_reward": 1.0, + "step": 1034, + "think_completion_length": 48.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.953125, + "epoch": 1.7470489038785835, + "grad_norm": 6.130999116033341, + "kl": 0.4736328125, + "learning_rate": 6.509274873524452e-07, + "loss": 0.0004, + "reward": 3.4069515466690063, + "reward_std": 0.24042115407064557, + "rewards/final_reward": 1.7034202245018428, + "rewards/mask_iou_reward": 0.8517101122509214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4069515466690063, + "rewards/thk_ans_format_reward": 1.0, + "step": 1035, + "think_completion_length": 52.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.015625, + "epoch": 1.7487352445193929, + "grad_norm": 5.4172837704426575, + "kl": 0.4267578125, + "learning_rate": 6.505902192242833e-07, + "loss": 0.0004, + "reward": 2.547904133796692, + "reward_std": 0.11162854917347431, + "rewards/final_reward": 0.0386555181927269, + "rewards/mask_iou_reward": 0.01932775909636345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5479041039943695, + "rewards/thk_ans_format_reward": 1.0, + "step": 1036, + "think_completion_length": 56.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.5, + "epoch": 1.7504215851602023, + "grad_norm": 33.880899390677925, + "kl": 0.46484375, + "learning_rate": 6.502529510961215e-07, + "loss": 0.0005, + "reward": 3.2491201162338257, + "reward_std": 0.16262406716123223, + "rewards/final_reward": 1.4198772441300478, + "rewards/mask_iou_reward": 0.7099386220650239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2647451758384705, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1037, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.59375, + "epoch": 1.752107925801012, + "grad_norm": 31.689644962105042, + "kl": 0.513671875, + "learning_rate": 6.499156829679595e-07, + "loss": 0.0005, + "reward": 3.308144211769104, + "reward_std": 0.06957734003663063, + "rewards/final_reward": 1.5038740533194743, + "rewards/mask_iou_reward": 0.7519370266597372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3081441521644592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1038, + "think_completion_length": 50.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.359375, + "epoch": 1.7537942664418211, + "grad_norm": 16.50416060782597, + "kl": 0.8203125, + "learning_rate": 6.495784148397976e-07, + "loss": 0.0008, + "reward": 2.7542325258255005, + "reward_std": 0.45590740442276, + "rewards/final_reward": 1.0102187496451316, + "rewards/mask_iou_reward": 0.5051093748225658, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7542325109243393, + "rewards/thk_ans_format_reward": 1.0, + "step": 1039, + "think_completion_length": 52.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.640625, + "epoch": 1.7554806070826308, + "grad_norm": 6.722045575335457, + "kl": 0.4248046875, + "learning_rate": 6.492411467116357e-07, + "loss": 0.0004, + "reward": 3.158905863761902, + "reward_std": 0.24892936274409294, + "rewards/final_reward": 1.0586207550274105, + "rewards/mask_iou_reward": 0.5293103775137052, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1589058637619019, + "rewards/thk_ans_format_reward": 1.0, + "step": 1040, + "think_completion_length": 51.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.921875, + "epoch": 1.7571669477234402, + "grad_norm": 9.194304976346283, + "kl": 0.4462890625, + "learning_rate": 6.489038785834738e-07, + "loss": 0.0004, + "reward": 3.509106397628784, + "reward_std": 0.1190731879323721, + "rewards/final_reward": 1.588458335658146, + "rewards/mask_iou_reward": 0.794229167829073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5091063976287842, + "rewards/thk_ans_format_reward": 1.0, + "step": 1041, + "think_completion_length": 65.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.640625, + "epoch": 1.7588532883642496, + "grad_norm": 25.54207241093614, + "kl": 0.46484375, + "learning_rate": 6.485666104553119e-07, + "loss": 0.0005, + "reward": 3.1248477697372437, + "reward_std": 0.30804644525051117, + "rewards/final_reward": 1.4215654945130256, + "rewards/mask_iou_reward": 0.7107827472565128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1248477101325989, + "rewards/thk_ans_format_reward": 1.0, + "step": 1042, + "think_completion_length": 55.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.40625, + "epoch": 1.760539629005059, + "grad_norm": 6.3995610391893, + "kl": 0.431640625, + "learning_rate": 6.482293423271501e-07, + "loss": 0.0004, + "reward": 3.5307756662368774, + "reward_std": 0.27365532889962196, + "rewards/final_reward": 1.4586337692907971, + "rewards/mask_iou_reward": 0.7293168846453986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.530775785446167, + "rewards/thk_ans_format_reward": 1.0, + "step": 1043, + "think_completion_length": 51.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.765625, + "epoch": 1.7622259696458684, + "grad_norm": 13.867074363938958, + "kl": 0.5029296875, + "learning_rate": 6.478920741989882e-07, + "loss": 0.0005, + "reward": 3.391425848007202, + "reward_std": 0.10993809998035431, + "rewards/final_reward": 1.2119668066644858, + "rewards/mask_iou_reward": 0.6059834033322429, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3914258480072021, + "rewards/thk_ans_format_reward": 1.0, + "step": 1044, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.625, + "epoch": 1.763912310286678, + "grad_norm": 3.4706313534724047, + "kl": 0.447265625, + "learning_rate": 6.475548060708263e-07, + "loss": 0.0004, + "reward": 2.8153789043426514, + "reward_std": 0.04241855535656214, + "rewards/final_reward": 0.18875813571196698, + "rewards/mask_iou_reward": 0.09437906785598349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8153788447380066, + "rewards/thk_ans_format_reward": 1.0, + "step": 1045, + "think_completion_length": 55.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.03125, + "epoch": 1.7655986509274872, + "grad_norm": 4.695301218534343, + "kl": 0.5595703125, + "learning_rate": 6.472175379426645e-07, + "loss": 0.0005, + "reward": 3.5663245916366577, + "reward_std": 0.0903189332166221, + "rewards/final_reward": 1.5521931867676244, + "rewards/mask_iou_reward": 0.7760965933838122, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5663246512413025, + "rewards/thk_ans_format_reward": 1.0, + "step": 1046, + "think_completion_length": 50.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.21875, + "epoch": 1.7672849915682969, + "grad_norm": 18.269976905096588, + "kl": 0.4765625, + "learning_rate": 6.468802698145025e-07, + "loss": 0.0005, + "reward": 3.310370087623596, + "reward_std": 0.0983478156849742, + "rewards/final_reward": 1.2876556004640443, + "rewards/mask_iou_reward": 0.6438278002320221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3103700280189514, + "rewards/thk_ans_format_reward": 1.0, + "step": 1047, + "think_completion_length": 48.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.140625, + "epoch": 1.768971332209106, + "grad_norm": 20.295614577115845, + "kl": 0.427734375, + "learning_rate": 6.465430016863405e-07, + "loss": 0.0004, + "reward": 2.9448623657226562, + "reward_std": 0.22916459874249995, + "rewards/final_reward": 0.40654357120444046, + "rewards/mask_iou_reward": 0.20327178560222023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9448623657226562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1048, + "think_completion_length": 53.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.78125, + "epoch": 1.7706576728499157, + "grad_norm": 9.65434244461805, + "kl": 0.4228515625, + "learning_rate": 6.462057335581787e-07, + "loss": 0.0004, + "reward": 3.411288022994995, + "reward_std": 0.21771667152643204, + "rewards/final_reward": 1.098073992844263, + "rewards/mask_iou_reward": 0.5490369964221316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4112881422042847, + "rewards/thk_ans_format_reward": 1.0, + "step": 1049, + "think_completion_length": 71.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.90625, + "epoch": 1.7723440134907251, + "grad_norm": 12.128173202319173, + "kl": 0.4677734375, + "learning_rate": 6.458684654300168e-07, + "loss": 0.0005, + "reward": 3.588056802749634, + "reward_std": 0.1198611631989479, + "rewards/final_reward": 1.7642558749700097, + "rewards/mask_iou_reward": 0.8821279374850048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5880568027496338, + "rewards/thk_ans_format_reward": 1.0, + "step": 1050, + "think_completion_length": 50.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.59375, + "epoch": 1.7740303541315345, + "grad_norm": 9.552772331867274, + "kl": 0.4638671875, + "learning_rate": 6.455311973018549e-07, + "loss": 0.0005, + "reward": 2.649136185646057, + "reward_std": 0.09959585964679718, + "rewards/final_reward": 0.70501502889627, + "rewards/mask_iou_reward": 0.352507514448135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6491361558437347, + "rewards/thk_ans_format_reward": 1.0, + "step": 1051, + "think_completion_length": 54.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5625, + "epoch": 1.7757166947723442, + "grad_norm": 7.4506685071560455, + "kl": 0.51953125, + "learning_rate": 6.451939291736931e-07, + "loss": 0.0005, + "reward": 3.7473593950271606, + "reward_std": 0.021921713836491108, + "rewards/final_reward": 1.6334566791793033, + "rewards/mask_iou_reward": 0.8167283395896516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7473594546318054, + "rewards/thk_ans_format_reward": 1.0, + "step": 1052, + "think_completion_length": 68.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 1.7774030354131534, + "grad_norm": 11.452336974810411, + "kl": 0.431640625, + "learning_rate": 6.448566610455312e-07, + "loss": 0.0004, + "reward": 3.4113532304763794, + "reward_std": 0.15724964579567313, + "rewards/final_reward": 1.3157614341154118, + "rewards/mask_iou_reward": 0.6578807170577059, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4113531708717346, + "rewards/thk_ans_format_reward": 1.0, + "step": 1053, + "think_completion_length": 60.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.375, + "epoch": 1.779089376053963, + "grad_norm": 5.636753144188764, + "kl": 0.4638671875, + "learning_rate": 6.445193929173693e-07, + "loss": 0.0005, + "reward": 2.8758299350738525, + "reward_std": 0.24051348865032196, + "rewards/final_reward": 0.7168088626733424, + "rewards/mask_iou_reward": 0.3584044313366712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8758300244808197, + "rewards/thk_ans_format_reward": 1.0, + "step": 1054, + "think_completion_length": 51.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.921875, + "epoch": 1.7807757166947722, + "grad_norm": 9.697318267357094, + "kl": 0.5068359375, + "learning_rate": 6.441821247892075e-07, + "loss": 0.0005, + "reward": 3.427851915359497, + "reward_std": 0.16114804474636912, + "rewards/final_reward": 1.7093380725822347, + "rewards/mask_iou_reward": 0.8546690362911173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4278518557548523, + "rewards/thk_ans_format_reward": 1.0, + "step": 1055, + "think_completion_length": 49.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.75, + "epoch": 1.7824620573355818, + "grad_norm": 5.99894941422999, + "kl": 0.4765625, + "learning_rate": 6.438448566610454e-07, + "loss": 0.0005, + "reward": 3.261791706085205, + "reward_std": 0.26013752818107605, + "rewards/final_reward": 1.0026498346326282, + "rewards/mask_iou_reward": 0.5013249173163141, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2617915272712708, + "rewards/thk_ans_format_reward": 1.0, + "step": 1056, + "think_completion_length": 55.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.890625, + "epoch": 1.7841483979763912, + "grad_norm": 18.634833449472875, + "kl": 0.5947265625, + "learning_rate": 6.435075885328835e-07, + "loss": 0.0006, + "reward": 3.4682400226593018, + "reward_std": 0.09191236272454262, + "rewards/final_reward": 1.3746822163096801, + "rewards/mask_iou_reward": 0.6873411081548401, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4682400822639465, + "rewards/thk_ans_format_reward": 1.0, + "step": 1057, + "think_completion_length": 52.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.140625, + "epoch": 1.7858347386172007, + "grad_norm": 13.903214719949169, + "kl": 0.3984375, + "learning_rate": 6.431703204047217e-07, + "loss": 0.0004, + "reward": 3.124882459640503, + "reward_std": 0.22556371614336967, + "rewards/final_reward": 1.3507318497634557, + "rewards/mask_iou_reward": 0.6753659248817279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.124882459640503, + "rewards/thk_ans_format_reward": 1.0, + "step": 1058, + "think_completion_length": 62.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.609375, + "epoch": 1.78752107925801, + "grad_norm": 6.8004779635053465, + "kl": 0.4423828125, + "learning_rate": 6.428330522765598e-07, + "loss": 0.0004, + "reward": 3.142040967941284, + "reward_std": 0.05950396414846182, + "rewards/final_reward": 1.3599471646303916, + "rewards/mask_iou_reward": 0.6799735823151958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1420409381389618, + "rewards/thk_ans_format_reward": 1.0, + "step": 1059, + "think_completion_length": 52.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 1.7892074198988195, + "grad_norm": 11.336990333878964, + "kl": 0.43359375, + "learning_rate": 6.42495784148398e-07, + "loss": 0.0004, + "reward": 3.2227389812469482, + "reward_std": 0.18827488273382187, + "rewards/final_reward": 1.7645477845927253, + "rewards/mask_iou_reward": 0.8822738922963627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2227389216423035, + "rewards/thk_ans_format_reward": 1.0, + "step": 1060, + "think_completion_length": 47.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25, + "epoch": 1.7908937605396291, + "grad_norm": 12.35064431303953, + "kl": 0.4462890625, + "learning_rate": 6.421585160202361e-07, + "loss": 0.0004, + "reward": 3.690014123916626, + "reward_std": 0.12409292161464691, + "rewards/final_reward": 1.561800469907261, + "rewards/mask_iou_reward": 0.7809002349536305, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6900140643119812, + "rewards/thk_ans_format_reward": 1.0, + "step": 1061, + "think_completion_length": 56.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.171875, + "epoch": 1.7925801011804383, + "grad_norm": 5.196701064961896, + "kl": 0.453125, + "learning_rate": 6.418212478920742e-07, + "loss": 0.0004, + "reward": 3.6945523023605347, + "reward_std": 0.13177293725311756, + "rewards/final_reward": 1.6393652382683739, + "rewards/mask_iou_reward": 0.8196826191341869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6945521235466003, + "rewards/thk_ans_format_reward": 1.0, + "step": 1062, + "think_completion_length": 52.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.9375, + "epoch": 1.794266441821248, + "grad_norm": 32.92921107957651, + "kl": 0.4814453125, + "learning_rate": 6.414839797639124e-07, + "loss": 0.0005, + "reward": 2.8126556873321533, + "reward_std": 0.27009210735559464, + "rewards/final_reward": 0.7486795351149669, + "rewards/mask_iou_reward": 0.37433976755748344, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8126558065414429, + "rewards/thk_ans_format_reward": 1.0, + "step": 1063, + "think_completion_length": 55.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 1.7959527824620574, + "grad_norm": 16.048422949673412, + "kl": 0.53125, + "learning_rate": 6.411467116357505e-07, + "loss": 0.0005, + "reward": 3.3275065422058105, + "reward_std": 0.04513479955494404, + "rewards/final_reward": 1.114819221313844, + "rewards/mask_iou_reward": 0.557409610656922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3275066614151, + "rewards/thk_ans_format_reward": 1.0, + "step": 1064, + "think_completion_length": 46.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.953125, + "epoch": 1.7976391231028668, + "grad_norm": 4.704587642281763, + "kl": 0.478515625, + "learning_rate": 6.408094435075884e-07, + "loss": 0.0005, + "reward": 3.2085570096969604, + "reward_std": 0.2231890894472599, + "rewards/final_reward": 1.7234846949920737, + "rewards/mask_iou_reward": 0.8617423474960368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2085569500923157, + "rewards/thk_ans_format_reward": 1.0, + "step": 1065, + "think_completion_length": 50.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.328125, + "epoch": 1.7993254637436762, + "grad_norm": 7.694097018347998, + "kl": 0.513671875, + "learning_rate": 6.404721753794266e-07, + "loss": 0.0005, + "reward": 2.546392798423767, + "reward_std": 0.24064208567142487, + "rewards/final_reward": 0.6668356472255468, + "rewards/mask_iou_reward": 0.3334178236127734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5463928133249283, + "rewards/thk_ans_format_reward": 1.0, + "step": 1066, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.171875, + "epoch": 1.8010118043844856, + "grad_norm": 15.129890320245305, + "kl": 0.4677734375, + "learning_rate": 6.401349072512647e-07, + "loss": 0.0005, + "reward": 3.544395089149475, + "reward_std": 0.04383156634867191, + "rewards/final_reward": 1.2383304283535561, + "rewards/mask_iou_reward": 0.6191652141767781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5443950295448303, + "rewards/thk_ans_format_reward": 1.0, + "step": 1067, + "think_completion_length": 47.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.515625, + "epoch": 1.8026981450252952, + "grad_norm": 10.479996959556871, + "kl": 0.4609375, + "learning_rate": 6.397976391231028e-07, + "loss": 0.0005, + "reward": 3.6842243671417236, + "reward_std": 0.08089240174740553, + "rewards/final_reward": 1.6949048454045466, + "rewards/mask_iou_reward": 0.8474524227022733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6842244267463684, + "rewards/thk_ans_format_reward": 1.0, + "step": 1068, + "think_completion_length": 53.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.671875, + "epoch": 1.8043844856661044, + "grad_norm": 87.39047841596138, + "kl": 0.453125, + "learning_rate": 6.39460370994941e-07, + "loss": 0.0005, + "reward": 3.355344772338867, + "reward_std": 0.19397838786244392, + "rewards/final_reward": 1.704898139101287, + "rewards/mask_iou_reward": 0.8524490695506435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3553447723388672, + "rewards/thk_ans_format_reward": 1.0, + "step": 1069, + "think_completion_length": 43.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.359375, + "epoch": 1.806070826306914, + "grad_norm": 7.768601686887431, + "kl": 0.4921875, + "learning_rate": 6.391231028667791e-07, + "loss": 0.0005, + "reward": 2.9813655614852905, + "reward_std": 0.12172066420316696, + "rewards/final_reward": 0.9793375309635147, + "rewards/mask_iou_reward": 0.48966876548175736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9813654273748398, + "rewards/thk_ans_format_reward": 1.0, + "step": 1070, + "think_completion_length": 48.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.40625, + "epoch": 1.8077571669477235, + "grad_norm": 11.418688751202348, + "kl": 0.49609375, + "learning_rate": 6.387858347386172e-07, + "loss": 0.0005, + "reward": 3.1727246046066284, + "reward_std": 0.2514045834541321, + "rewards/final_reward": 1.5329492331264845, + "rewards/mask_iou_reward": 0.7664746165632422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1727246642112732, + "rewards/thk_ans_format_reward": 1.0, + "step": 1071, + "think_completion_length": 39.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.953125, + "epoch": 1.809443507588533, + "grad_norm": 7.214437740443753, + "kl": 0.630859375, + "learning_rate": 6.384485666104554e-07, + "loss": 0.0006, + "reward": 3.3107919692993164, + "reward_std": 0.15224889293313026, + "rewards/final_reward": 0.9352677655935655, + "rewards/mask_iou_reward": 0.46763388279678275, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3107921481132507, + "rewards/thk_ans_format_reward": 1.0, + "step": 1072, + "think_completion_length": 48.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.125, + "epoch": 1.8111298482293423, + "grad_norm": 17.597183249832156, + "kl": 0.4599609375, + "learning_rate": 6.381112984822933e-07, + "loss": 0.0005, + "reward": 3.603898763656616, + "reward_std": 0.1738036908209324, + "rewards/final_reward": 1.9033266308019536, + "rewards/mask_iou_reward": 0.9516633154009768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6038986444473267, + "rewards/thk_ans_format_reward": 1.0, + "step": 1073, + "think_completion_length": 52.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.375, + "epoch": 1.8128161888701517, + "grad_norm": 8.705159581822338, + "kl": 0.494140625, + "learning_rate": 6.377740303541314e-07, + "loss": 0.0005, + "reward": 3.3796963691711426, + "reward_std": 0.16491149365901947, + "rewards/final_reward": 1.393316190307897, + "rewards/mask_iou_reward": 0.6966580951539485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.379696547985077, + "rewards/thk_ans_format_reward": 1.0, + "step": 1074, + "think_completion_length": 44.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.828125, + "epoch": 1.8145025295109614, + "grad_norm": 31.20759043501845, + "kl": 0.451171875, + "learning_rate": 6.374367622259696e-07, + "loss": 0.0005, + "reward": 3.3295449018478394, + "reward_std": 0.12656425312161446, + "rewards/final_reward": 1.7795149105856936, + "rewards/mask_iou_reward": 0.8897574552928468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3295449018478394, + "rewards/thk_ans_format_reward": 1.0, + "step": 1075, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.703125, + "epoch": 1.8161888701517706, + "grad_norm": 5.87686245738229, + "kl": 0.5361328125, + "learning_rate": 6.370994940978077e-07, + "loss": 0.0005, + "reward": 3.0567972660064697, + "reward_std": 0.2288198471069336, + "rewards/final_reward": 1.0152084192726702, + "rewards/mask_iou_reward": 0.5076042096363351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0567971467971802, + "rewards/thk_ans_format_reward": 1.0, + "step": 1076, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6875, + "epoch": 1.8178752107925802, + "grad_norm": 15.31134271285599, + "kl": 0.486328125, + "learning_rate": 6.367622259696458e-07, + "loss": 0.0005, + "reward": 3.3816200494766235, + "reward_std": 0.0680837333202362, + "rewards/final_reward": 1.6581806400046022, + "rewards/mask_iou_reward": 0.8290903200023011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3816199898719788, + "rewards/thk_ans_format_reward": 1.0, + "step": 1077, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 1.8195615514333894, + "grad_norm": 39.16886392147183, + "kl": 0.484375, + "learning_rate": 6.36424957841484e-07, + "loss": 0.0005, + "reward": 3.5349349975585938, + "reward_std": 0.10329584777355194, + "rewards/final_reward": 1.5334088335683878, + "rewards/mask_iou_reward": 0.7667044167841939, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5349348783493042, + "rewards/thk_ans_format_reward": 1.0, + "step": 1078, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 1.821247892074199, + "grad_norm": 7.611256628332114, + "kl": 0.4736328125, + "learning_rate": 6.360876897133221e-07, + "loss": 0.0005, + "reward": 3.3147194385528564, + "reward_std": 0.074610386043787, + "rewards/final_reward": 1.6858395633390624, + "rewards/mask_iou_reward": 0.8429197816695312, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3147195279598236, + "rewards/thk_ans_format_reward": 1.0, + "step": 1079, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0, + "epoch": 1.8229342327150084, + "grad_norm": 12.382623549496923, + "kl": 0.509765625, + "learning_rate": 6.357504215851602e-07, + "loss": 0.0005, + "reward": 2.8961130380630493, + "reward_std": 0.24354761838912964, + "rewards/final_reward": 0.6990347408050159, + "rewards/mask_iou_reward": 0.34951737040250797, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8961129784584045, + "rewards/thk_ans_format_reward": 1.0, + "step": 1080, + "think_completion_length": 42.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.640625, + "epoch": 1.8246205733558178, + "grad_norm": 12.439318435799716, + "kl": 0.5390625, + "learning_rate": 6.354131534569983e-07, + "loss": 0.0005, + "reward": 3.2025067806243896, + "reward_std": 0.10669799149036407, + "rewards/final_reward": 1.0149730957567362, + "rewards/mask_iou_reward": 0.5074865478783681, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.202506572008133, + "rewards/thk_ans_format_reward": 1.0, + "step": 1081, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.4375, + "epoch": 1.8263069139966275, + "grad_norm": 6.201103280303299, + "kl": 0.55078125, + "learning_rate": 6.350758853288363e-07, + "loss": 0.0006, + "reward": 3.2389400005340576, + "reward_std": 0.06579168047755957, + "rewards/final_reward": 1.4973005004889282, + "rewards/mask_iou_reward": 0.7486502502444641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2389401197433472, + "rewards/thk_ans_format_reward": 1.0, + "step": 1082, + "think_completion_length": 63.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.59375, + "epoch": 1.8279932546374367, + "grad_norm": 11.23582946892846, + "kl": 0.466796875, + "learning_rate": 6.347386172006744e-07, + "loss": 0.0005, + "reward": 3.2684438228607178, + "reward_std": 0.19575618207454681, + "rewards/final_reward": 1.6867246645460408, + "rewards/mask_iou_reward": 0.8433623322730204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.268443763256073, + "rewards/thk_ans_format_reward": 1.0, + "step": 1083, + "think_completion_length": 40.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.765625, + "epoch": 1.8296795952782463, + "grad_norm": 12.126812671752475, + "kl": 0.654296875, + "learning_rate": 6.344013490725126e-07, + "loss": 0.0007, + "reward": 3.1550729274749756, + "reward_std": 0.5026094168424606, + "rewards/final_reward": 1.0971722753586644, + "rewards/mask_iou_reward": 0.5485861376793322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1550728678703308, + "rewards/thk_ans_format_reward": 1.0, + "step": 1084, + "think_completion_length": 49.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.578125, + "epoch": 1.8313659359190555, + "grad_norm": 18.21812835633944, + "kl": 0.4580078125, + "learning_rate": 6.340640809443507e-07, + "loss": 0.0005, + "reward": 2.624867796897888, + "reward_std": 0.22860441729426384, + "rewards/final_reward": 0.8603130411544049, + "rewards/mask_iou_reward": 0.43015652057720244, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.6404927968978882, + "rewards/thk_ans_format_reward": 1.0, + "step": 1085, + "think_completion_length": 42.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.890625, + "epoch": 1.8330522765598651, + "grad_norm": 5.478540155142122, + "kl": 0.5126953125, + "learning_rate": 6.337268128161889e-07, + "loss": 0.0005, + "reward": 3.58796763420105, + "reward_std": 0.012379450490698218, + "rewards/final_reward": 1.3301058680886824, + "rewards/mask_iou_reward": 0.6650529340443412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.587967574596405, + "rewards/thk_ans_format_reward": 1.0, + "step": 1086, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.515625, + "epoch": 1.8347386172006745, + "grad_norm": 13.300127679236585, + "kl": 0.6044921875, + "learning_rate": 6.33389544688027e-07, + "loss": 0.0006, + "reward": 3.036729335784912, + "reward_std": 0.054497267585247755, + "rewards/final_reward": 0.6875000276307961, + "rewards/mask_iou_reward": 0.34375001381539805, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0367292761802673, + "rewards/thk_ans_format_reward": 1.0, + "step": 1087, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.328125, + "epoch": 1.836424957841484, + "grad_norm": 7.887733188962814, + "kl": 0.46484375, + "learning_rate": 6.330522765598651e-07, + "loss": 0.0005, + "reward": 2.800957202911377, + "reward_std": 0.4332638531923294, + "rewards/final_reward": 1.2232966038997053, + "rewards/mask_iou_reward": 0.6116483019498526, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8009572625160217, + "rewards/thk_ans_format_reward": 1.0, + "step": 1088, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.078125, + "epoch": 1.8381112984822934, + "grad_norm": 21.665998281230607, + "kl": 0.482421875, + "learning_rate": 6.327150084317033e-07, + "loss": 0.0005, + "reward": 3.037529468536377, + "reward_std": 0.2697841115295887, + "rewards/final_reward": 0.8968575313778724, + "rewards/mask_iou_reward": 0.4484287656889362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.037529468536377, + "rewards/thk_ans_format_reward": 1.0, + "step": 1089, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.28125, + "epoch": 1.8397976391231028, + "grad_norm": 13.069209866194338, + "kl": 0.4541015625, + "learning_rate": 6.323777403035413e-07, + "loss": 0.0005, + "reward": 3.4228765964508057, + "reward_std": 0.2066943645477295, + "rewards/final_reward": 1.433857315425791, + "rewards/mask_iou_reward": 0.7169286577128955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4228765368461609, + "rewards/thk_ans_format_reward": 1.0, + "step": 1090, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.21875, + "epoch": 1.8414839797639124, + "grad_norm": 6.749037357878424, + "kl": 0.4404296875, + "learning_rate": 6.320404721753793e-07, + "loss": 0.0004, + "reward": 2.5815987586975098, + "reward_std": 0.1750339277787134, + "rewards/final_reward": 0.39387072797870026, + "rewards/mask_iou_reward": 0.19693536398935013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5815986543893814, + "rewards/thk_ans_format_reward": 1.0, + "step": 1091, + "think_completion_length": 50.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.1875, + "epoch": 1.8431703204047216, + "grad_norm": 19.693867653877927, + "kl": 0.484375, + "learning_rate": 6.317032040472175e-07, + "loss": 0.0005, + "reward": 3.2191600799560547, + "reward_std": 0.34185342490673065, + "rewards/final_reward": 1.082252412484555, + "rewards/mask_iou_reward": 0.5411262062422775, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.219160258769989, + "rewards/thk_ans_format_reward": 1.0, + "step": 1092, + "think_completion_length": 43.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.921875, + "epoch": 1.8448566610455313, + "grad_norm": 8.681066579359914, + "kl": 0.50390625, + "learning_rate": 6.313659359190556e-07, + "loss": 0.0005, + "reward": 3.290645480155945, + "reward_std": 0.11531023494899273, + "rewards/final_reward": 0.7578482514866762, + "rewards/mask_iou_reward": 0.3789241257433381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2906455397605896, + "rewards/thk_ans_format_reward": 1.0, + "step": 1093, + "think_completion_length": 52.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.09375, + "epoch": 1.8465430016863407, + "grad_norm": 14.19594490203886, + "kl": 0.501953125, + "learning_rate": 6.310286677908937e-07, + "loss": 0.0005, + "reward": 2.789927124977112, + "reward_std": 0.1311767096631229, + "rewards/final_reward": 1.042739142866941, + "rewards/mask_iou_reward": 0.5213695714334705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7899271547794342, + "rewards/thk_ans_format_reward": 1.0, + "step": 1094, + "think_completion_length": 49.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.046875, + "epoch": 1.84822934232715, + "grad_norm": 7.918474446662982, + "kl": 0.44921875, + "learning_rate": 6.306913996627319e-07, + "loss": 0.0005, + "reward": 3.8491674661636353, + "reward_std": 0.022587507497519255, + "rewards/final_reward": 1.8873137668664528, + "rewards/mask_iou_reward": 0.9436568834332264, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8491673469543457, + "rewards/thk_ans_format_reward": 1.0, + "step": 1095, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.65625, + "epoch": 1.8499156829679595, + "grad_norm": 8.515437657907258, + "kl": 0.62109375, + "learning_rate": 6.3035413153457e-07, + "loss": 0.0006, + "reward": 3.223744511604309, + "reward_std": 0.384935200214386, + "rewards/final_reward": 1.5109505036988033, + "rewards/mask_iou_reward": 0.7554752518494017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2237444519996643, + "rewards/thk_ans_format_reward": 1.0, + "step": 1096, + "think_completion_length": 53.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.078125, + "epoch": 1.851602023608769, + "grad_norm": 7.834563260850435, + "kl": 0.57421875, + "learning_rate": 6.300168634064081e-07, + "loss": 0.0006, + "reward": 2.8379101753234863, + "reward_std": 0.19312208145856857, + "rewards/final_reward": 0.766568190172088, + "rewards/mask_iou_reward": 0.383284095086044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8379101008176804, + "rewards/thk_ans_format_reward": 1.0, + "step": 1097, + "think_completion_length": 48.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.15625, + "epoch": 1.8532883642495785, + "grad_norm": 14.297124589618678, + "kl": 0.4443359375, + "learning_rate": 6.296795952782462e-07, + "loss": 0.0004, + "reward": 3.3441884517669678, + "reward_std": 0.03978629596531391, + "rewards/final_reward": 0.8399567347734211, + "rewards/mask_iou_reward": 0.41997836738671057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.344188630580902, + "rewards/thk_ans_format_reward": 1.0, + "step": 1098, + "think_completion_length": 51.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 1.8549747048903877, + "grad_norm": 9.448604353193767, + "kl": 0.818359375, + "learning_rate": 6.293423271500843e-07, + "loss": 0.0008, + "reward": 2.577287793159485, + "reward_std": 0.10695656202733517, + "rewards/final_reward": 0.17409313455119524, + "rewards/mask_iou_reward": 0.08704656727559762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5772877894341946, + "rewards/thk_ans_format_reward": 1.0, + "step": 1099, + "think_completion_length": 51.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.5, + "epoch": 1.8566610455311974, + "grad_norm": 41.754579974458586, + "kl": 0.501953125, + "learning_rate": 6.290050590219223e-07, + "loss": 0.0005, + "reward": 3.414603114128113, + "reward_std": 0.07115489459829405, + "rewards/final_reward": 1.2164036374620517, + "rewards/mask_iou_reward": 0.6082018187310259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4146031737327576, + "rewards/thk_ans_format_reward": 1.0, + "step": 1100, + "think_completion_length": 50.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.8125, + "epoch": 1.8583473861720068, + "grad_norm": 21.23442828983993, + "kl": 0.611328125, + "learning_rate": 6.286677908937605e-07, + "loss": 0.0006, + "reward": 3.3198102712631226, + "reward_std": 0.045665791258215904, + "rewards/final_reward": 1.6833926643806696, + "rewards/mask_iou_reward": 0.8416963321903348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.319810390472412, + "rewards/thk_ans_format_reward": 1.0, + "step": 1101, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 1.8600337268128162, + "grad_norm": 7.060350179958177, + "kl": 0.486328125, + "learning_rate": 6.283305227655986e-07, + "loss": 0.0005, + "reward": 3.4817529916763306, + "reward_std": 0.057911899872124195, + "rewards/final_reward": 1.4049579396114342, + "rewards/mask_iou_reward": 0.7024789698057171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4817529320716858, + "rewards/thk_ans_format_reward": 1.0, + "step": 1102, + "think_completion_length": 48.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.78125, + "epoch": 1.8617200674536256, + "grad_norm": 10.113922661906768, + "kl": 0.4765625, + "learning_rate": 6.279932546374367e-07, + "loss": 0.0005, + "reward": 2.7638001441955566, + "reward_std": 0.13444151729345322, + "rewards/final_reward": 0.11543181604880562, + "rewards/mask_iou_reward": 0.05771590802440281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7638000845909119, + "rewards/thk_ans_format_reward": 1.0, + "step": 1103, + "think_completion_length": 48.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.578125, + "epoch": 1.863406408094435, + "grad_norm": 12.2379733263627, + "kl": 0.4501953125, + "learning_rate": 6.276559865092749e-07, + "loss": 0.0004, + "reward": 2.952306628227234, + "reward_std": 0.2903987839818001, + "rewards/final_reward": 0.2940298889414863, + "rewards/mask_iou_reward": 0.14701494447074315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9523066282272339, + "rewards/thk_ans_format_reward": 1.0, + "step": 1104, + "think_completion_length": 63.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.78125, + "epoch": 1.8650927487352447, + "grad_norm": 7.307629859355944, + "kl": 0.458984375, + "learning_rate": 6.27318718381113e-07, + "loss": 0.0004, + "reward": 3.6160776615142822, + "reward_std": 0.26231749448925257, + "rewards/final_reward": 1.918327592949087, + "rewards/mask_iou_reward": 0.9591637964745435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6160775423049927, + "rewards/thk_ans_format_reward": 1.0, + "step": 1105, + "think_completion_length": 46.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 1.8667790893760539, + "grad_norm": 8.120904059288417, + "kl": 0.4375, + "learning_rate": 6.26981450252951e-07, + "loss": 0.0005, + "reward": 2.6095887422561646, + "reward_std": 0.36206041276454926, + "rewards/final_reward": 0.18646807204091528, + "rewards/mask_iou_reward": 0.09323403602045764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6095886826515198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1106, + "think_completion_length": 55.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.96875, + "epoch": 1.8684654300168635, + "grad_norm": 4.949742663888367, + "kl": 0.4736328125, + "learning_rate": 6.266441821247892e-07, + "loss": 0.0005, + "reward": 3.171900749206543, + "reward_std": 0.1200435683131218, + "rewards/final_reward": 1.7301323434279121, + "rewards/mask_iou_reward": 0.8650661717139561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1719006299972534, + "rewards/thk_ans_format_reward": 1.0, + "step": 1107, + "think_completion_length": 49.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.140625, + "epoch": 1.8701517706576727, + "grad_norm": 6.932244002142069, + "kl": 0.501953125, + "learning_rate": 6.263069139966273e-07, + "loss": 0.0005, + "reward": 2.9779754877090454, + "reward_std": 0.1466265469789505, + "rewards/final_reward": 1.129309473098064, + "rewards/mask_iou_reward": 0.564654736549032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9779754877090454, + "rewards/thk_ans_format_reward": 1.0, + "step": 1108, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.140625, + "epoch": 1.8718381112984823, + "grad_norm": 7.343964169963569, + "kl": 0.4931640625, + "learning_rate": 6.259696458684654e-07, + "loss": 0.0005, + "reward": 2.8175435066223145, + "reward_std": 0.016309996135532856, + "rewards/final_reward": 0.8358996803031749, + "rewards/mask_iou_reward": 0.4179498401515874, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8175435662269592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1109, + "think_completion_length": 48.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.609375, + "epoch": 1.8735244519392917, + "grad_norm": 5.530330908664622, + "kl": 0.505859375, + "learning_rate": 6.256323777403035e-07, + "loss": 0.0005, + "reward": 3.406356692314148, + "reward_std": 0.13961811736226082, + "rewards/final_reward": 1.3290365300351592, + "rewards/mask_iou_reward": 0.6645182650175796, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4063568115234375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1110, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.28125, + "epoch": 1.8752107925801011, + "grad_norm": 5.96310263404028, + "kl": 0.517578125, + "learning_rate": 6.252951096121416e-07, + "loss": 0.0005, + "reward": 3.4238619804382324, + "reward_std": 0.2606248203665018, + "rewards/final_reward": 1.360026099518807, + "rewards/mask_iou_reward": 0.6800130497594035, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4238619804382324, + "rewards/thk_ans_format_reward": 1.0, + "step": 1111, + "think_completion_length": 47.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.90625, + "epoch": 1.8768971332209108, + "grad_norm": 6.039639555921078, + "kl": 0.455078125, + "learning_rate": 6.249578414839798e-07, + "loss": 0.0005, + "reward": 3.1803025007247925, + "reward_std": 0.14678914099931717, + "rewards/final_reward": 0.7781353902486867, + "rewards/mask_iou_reward": 0.38906769512434336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.18030247092247, + "rewards/thk_ans_format_reward": 1.0, + "step": 1112, + "think_completion_length": 54.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.59375, + "epoch": 1.87858347386172, + "grad_norm": 8.678069428182924, + "kl": 0.4697265625, + "learning_rate": 6.246205733558179e-07, + "loss": 0.0005, + "reward": 3.3460100889205933, + "reward_std": 0.2927638292312622, + "rewards/final_reward": 1.6371790768525196, + "rewards/mask_iou_reward": 0.8185895384262598, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3616352081298828, + "rewards/thk_ans_format_reward": 1.0, + "step": 1113, + "think_completion_length": 58.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.15625, + "epoch": 1.8802698145025296, + "grad_norm": 19.248741631074427, + "kl": 0.462890625, + "learning_rate": 6.242833052276559e-07, + "loss": 0.0005, + "reward": 3.0415929555892944, + "reward_std": 0.15946677327156067, + "rewards/final_reward": 0.6804740707632364, + "rewards/mask_iou_reward": 0.3402370353816182, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0415929555892944, + "rewards/thk_ans_format_reward": 1.0, + "step": 1114, + "think_completion_length": 55.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.859375, + "epoch": 1.8819561551433388, + "grad_norm": 4.513737480234631, + "kl": 0.4755859375, + "learning_rate": 6.239460370994941e-07, + "loss": 0.0005, + "reward": 2.9178273677825928, + "reward_std": 0.13480617478489876, + "rewards/final_reward": 0.7582901537480667, + "rewards/mask_iou_reward": 0.37914507687403337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9178274273872375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1115, + "think_completion_length": 59.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.984375, + "epoch": 1.8836424957841484, + "grad_norm": 9.499820735106061, + "kl": 0.5, + "learning_rate": 6.236087689713322e-07, + "loss": 0.0005, + "reward": 3.2555822134017944, + "reward_std": 0.29104815423488617, + "rewards/final_reward": 1.06362789651073, + "rewards/mask_iou_reward": 0.531813948255365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2555820941925049, + "rewards/thk_ans_format_reward": 1.0, + "step": 1116, + "think_completion_length": 57.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.015625, + "epoch": 1.8853288364249579, + "grad_norm": 14.094908197904413, + "kl": 0.47265625, + "learning_rate": 6.232715008431702e-07, + "loss": 0.0005, + "reward": 3.2906464338302612, + "reward_std": 0.14907943457365036, + "rewards/final_reward": 0.9073393636444084, + "rewards/mask_iou_reward": 0.4536696818222042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2906464636325836, + "rewards/thk_ans_format_reward": 1.0, + "step": 1117, + "think_completion_length": 57.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.609375, + "epoch": 1.8870151770657673, + "grad_norm": 14.820800134042324, + "kl": 0.458984375, + "learning_rate": 6.229342327150084e-07, + "loss": 0.0005, + "reward": 3.3356423377990723, + "reward_std": 0.33301595598459244, + "rewards/final_reward": 1.497528428462273, + "rewards/mask_iou_reward": 0.7487642142311365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3356422781944275, + "rewards/thk_ans_format_reward": 1.0, + "step": 1118, + "think_completion_length": 59.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 1.8887015177065767, + "grad_norm": 3.7472328819560543, + "kl": 0.447265625, + "learning_rate": 6.225969645868465e-07, + "loss": 0.0004, + "reward": 3.4780514240264893, + "reward_std": 0.04463301133364439, + "rewards/final_reward": 1.4461199073916258, + "rewards/mask_iou_reward": 0.7230599536958129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4780513644218445, + "rewards/thk_ans_format_reward": 1.0, + "step": 1119, + "think_completion_length": 49.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.25, + "epoch": 1.890387858347386, + "grad_norm": 5.568276911965404, + "kl": 0.474609375, + "learning_rate": 6.222596964586846e-07, + "loss": 0.0005, + "reward": 3.1224676370620728, + "reward_std": 0.19673360884189606, + "rewards/final_reward": 1.381617815496326, + "rewards/mask_iou_reward": 0.690808907748163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.122467577457428, + "rewards/thk_ans_format_reward": 1.0, + "step": 1120, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.828125, + "epoch": 1.8920741989881957, + "grad_norm": 29.02279025388352, + "kl": 0.4521484375, + "learning_rate": 6.219224283305228e-07, + "loss": 0.0005, + "reward": 3.0229815244674683, + "reward_std": 0.2775159105658531, + "rewards/final_reward": 1.3215004490247066, + "rewards/mask_iou_reward": 0.6607502245123533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0229815542697906, + "rewards/thk_ans_format_reward": 1.0, + "step": 1121, + "think_completion_length": 50.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.15625, + "epoch": 1.893760539629005, + "grad_norm": 8.50671484702284, + "kl": 0.4091796875, + "learning_rate": 6.215851602023609e-07, + "loss": 0.0004, + "reward": 3.070726156234741, + "reward_std": 0.4190548211336136, + "rewards/final_reward": 1.7271931964343437, + "rewards/mask_iou_reward": 0.8635965982171718, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0707260966300964, + "rewards/thk_ans_format_reward": 1.0, + "step": 1122, + "think_completion_length": 52.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 1.8954468802698146, + "grad_norm": 41.49957656934944, + "kl": 0.501953125, + "learning_rate": 6.212478920741989e-07, + "loss": 0.0005, + "reward": 3.3182214498519897, + "reward_std": 0.14836269989609718, + "rewards/final_reward": 1.350961792262883, + "rewards/mask_iou_reward": 0.6754808961314415, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3182214498519897, + "rewards/thk_ans_format_reward": 1.0, + "step": 1123, + "think_completion_length": 54.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.640625, + "epoch": 1.897133220910624, + "grad_norm": 6.733032689079996, + "kl": 0.4013671875, + "learning_rate": 6.209106239460371e-07, + "loss": 0.0004, + "reward": 3.5246121883392334, + "reward_std": 0.2807541564106941, + "rewards/final_reward": 1.4699369159761246, + "rewards/mask_iou_reward": 0.7349684579880623, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5246121883392334, + "rewards/thk_ans_format_reward": 1.0, + "step": 1124, + "think_completion_length": 50.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.609375, + "epoch": 1.8988195615514334, + "grad_norm": 12.621584714509966, + "kl": 0.4697265625, + "learning_rate": 6.205733558178752e-07, + "loss": 0.0005, + "reward": 3.1331878900527954, + "reward_std": 0.12203128053806722, + "rewards/final_reward": 0.42254711525684857, + "rewards/mask_iou_reward": 0.21127355762842429, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1488128304481506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1125, + "think_completion_length": 46.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.296875, + "epoch": 1.9005059021922428, + "grad_norm": 9.673146215837875, + "kl": 0.4873046875, + "learning_rate": 6.202360876897132e-07, + "loss": 0.0005, + "reward": 3.5485845804214478, + "reward_std": 0.05991579405963421, + "rewards/final_reward": 1.4075040653508908, + "rewards/mask_iou_reward": 0.7037520326754454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.548584520816803, + "rewards/thk_ans_format_reward": 1.0, + "step": 1126, + "think_completion_length": 54.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.90625, + "epoch": 1.9021922428330522, + "grad_norm": 5.361577006546381, + "kl": 0.541015625, + "learning_rate": 6.198988195615514e-07, + "loss": 0.0005, + "reward": 3.0988335609436035, + "reward_std": 0.1889047771692276, + "rewards/final_reward": 1.1874666094459694, + "rewards/mask_iou_reward": 0.5937333047229847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0988333821296692, + "rewards/thk_ans_format_reward": 1.0, + "step": 1127, + "think_completion_length": 54.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.125, + "epoch": 1.9038785834738619, + "grad_norm": 5.952423108624937, + "kl": 0.458984375, + "learning_rate": 6.195615514333895e-07, + "loss": 0.0005, + "reward": 3.1148312091827393, + "reward_std": 0.14229458943009377, + "rewards/final_reward": 1.3390039678383898, + "rewards/mask_iou_reward": 0.6695019839191949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1148313283920288, + "rewards/thk_ans_format_reward": 1.0, + "step": 1128, + "think_completion_length": 63.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.546875, + "epoch": 1.905564924114671, + "grad_norm": 8.254122000550824, + "kl": 0.458984375, + "learning_rate": 6.192242833052276e-07, + "loss": 0.0005, + "reward": 2.7562084197998047, + "reward_std": 0.39115703105926514, + "rewards/final_reward": 0.8909723524188679, + "rewards/mask_iou_reward": 0.4454861762094339, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7562084496021271, + "rewards/thk_ans_format_reward": 1.0, + "step": 1129, + "think_completion_length": 55.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.21875, + "epoch": 1.9072512647554807, + "grad_norm": 8.147271330894098, + "kl": 0.447265625, + "learning_rate": 6.188870151770658e-07, + "loss": 0.0004, + "reward": 3.0163233280181885, + "reward_std": 0.12908070534467697, + "rewards/final_reward": 0.8853229324141032, + "rewards/mask_iou_reward": 0.4426614662070516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.016323447227478, + "rewards/thk_ans_format_reward": 1.0, + "step": 1130, + "think_completion_length": 54.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.421875, + "epoch": 1.90893760539629, + "grad_norm": 14.783755350176113, + "kl": 0.4609375, + "learning_rate": 6.185497470489038e-07, + "loss": 0.0005, + "reward": 3.187541961669922, + "reward_std": 0.23147797584533691, + "rewards/final_reward": 1.5944909250270047, + "rewards/mask_iou_reward": 0.7972454625135024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.187541902065277, + "rewards/thk_ans_format_reward": 1.0, + "step": 1131, + "think_completion_length": 52.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.484375, + "epoch": 1.9106239460370995, + "grad_norm": 8.047496221321932, + "kl": 0.4609375, + "learning_rate": 6.182124789207419e-07, + "loss": 0.0005, + "reward": 3.281162738800049, + "reward_std": 0.09780286997556686, + "rewards/final_reward": 0.9920333978301556, + "rewards/mask_iou_reward": 0.4960166989150778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.281162679195404, + "rewards/thk_ans_format_reward": 1.0, + "step": 1132, + "think_completion_length": 45.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.890625, + "epoch": 1.912310286677909, + "grad_norm": 9.426326655888504, + "kl": 0.4375, + "learning_rate": 6.178752107925801e-07, + "loss": 0.0004, + "reward": 3.7058013677597046, + "reward_std": 0.16382237616926432, + "rewards/final_reward": 1.8595333019630167, + "rewards/mask_iou_reward": 0.9297666509815083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7058013081550598, + "rewards/thk_ans_format_reward": 1.0, + "step": 1133, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.46875, + "epoch": 1.9139966273187183, + "grad_norm": 13.810244184431745, + "kl": 0.4287109375, + "learning_rate": 6.175379426644182e-07, + "loss": 0.0004, + "reward": 2.5829213857650757, + "reward_std": 0.23861295729875565, + "rewards/final_reward": 0.649385258994511, + "rewards/mask_iou_reward": 0.3246926294972555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5829213559627533, + "rewards/thk_ans_format_reward": 1.0, + "step": 1134, + "think_completion_length": 52.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.546875, + "epoch": 1.915682967959528, + "grad_norm": 7.353119304271834, + "kl": 0.4150390625, + "learning_rate": 6.172006745362564e-07, + "loss": 0.0004, + "reward": 3.28650164604187, + "reward_std": 0.1776575818657875, + "rewards/final_reward": 1.434402230781888, + "rewards/mask_iou_reward": 0.717201115390944, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2865016460418701, + "rewards/thk_ans_format_reward": 1.0, + "step": 1135, + "think_completion_length": 48.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.21875, + "epoch": 1.9173693086003372, + "grad_norm": 9.605920969673068, + "kl": 0.408203125, + "learning_rate": 6.168634064080944e-07, + "loss": 0.0004, + "reward": 3.2172775268554688, + "reward_std": 0.1914580576121807, + "rewards/final_reward": 1.293543048601166, + "rewards/mask_iou_reward": 0.646771524300583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2172774970531464, + "rewards/thk_ans_format_reward": 1.0, + "step": 1136, + "think_completion_length": 55.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.46875, + "epoch": 1.9190556492411468, + "grad_norm": 5.6346964134655675, + "kl": 0.591796875, + "learning_rate": 6.165261382799325e-07, + "loss": 0.0006, + "reward": 3.438536763191223, + "reward_std": 0.18300874158740044, + "rewards/final_reward": 1.463139205462022, + "rewards/mask_iou_reward": 0.731569602731011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4385367631912231, + "rewards/thk_ans_format_reward": 1.0, + "step": 1137, + "think_completion_length": 53.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 1.920741989881956, + "grad_norm": 8.352522402689223, + "kl": 0.513671875, + "learning_rate": 6.161888701517707e-07, + "loss": 0.0005, + "reward": 3.0348947048187256, + "reward_std": 0.09543421119451523, + "rewards/final_reward": 0.6401037795648319, + "rewards/mask_iou_reward": 0.32005188978241594, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0348947048187256, + "rewards/thk_ans_format_reward": 1.0, + "step": 1138, + "think_completion_length": 54.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.15625, + "epoch": 1.9224283305227656, + "grad_norm": 12.679459677972318, + "kl": 0.400390625, + "learning_rate": 6.158516020236087e-07, + "loss": 0.0004, + "reward": 2.949214816093445, + "reward_std": 0.2778293192386627, + "rewards/final_reward": 1.1230542828676453, + "rewards/mask_iou_reward": 0.5615271414338227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9492146372795105, + "rewards/thk_ans_format_reward": 1.0, + "step": 1139, + "think_completion_length": 50.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.546875, + "epoch": 1.924114671163575, + "grad_norm": 8.364381572643524, + "kl": 0.42578125, + "learning_rate": 6.155143338954468e-07, + "loss": 0.0004, + "reward": 3.144506812095642, + "reward_std": 0.08507668972015381, + "rewards/final_reward": 0.7166029162035097, + "rewards/mask_iou_reward": 0.35830145810175484, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1445067524909973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1140, + "think_completion_length": 55.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.734375, + "epoch": 1.9258010118043845, + "grad_norm": 122.5712585070957, + "kl": 0.4560546875, + "learning_rate": 6.15177065767285e-07, + "loss": 0.0005, + "reward": 2.9750025272369385, + "reward_std": 0.1318796332925558, + "rewards/final_reward": 0.9958614977067766, + "rewards/mask_iou_reward": 0.4979307488533883, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9750024378299713, + "rewards/thk_ans_format_reward": 1.0, + "step": 1141, + "think_completion_length": 54.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.828125, + "epoch": 1.927487352445194, + "grad_norm": 17.82613440604544, + "kl": 0.458984375, + "learning_rate": 6.148397976391231e-07, + "loss": 0.0005, + "reward": 3.3588626384735107, + "reward_std": 0.18949565291404724, + "rewards/final_reward": 1.3248426813549425, + "rewards/mask_iou_reward": 0.6624213406774713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.358862578868866, + "rewards/thk_ans_format_reward": 1.0, + "step": 1142, + "think_completion_length": 51.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.546875, + "epoch": 1.9291736930860033, + "grad_norm": 90.73018968754026, + "kl": 0.4677734375, + "learning_rate": 6.145025295109612e-07, + "loss": 0.0005, + "reward": 3.493720531463623, + "reward_std": 0.12904378399252892, + "rewards/final_reward": 1.0748831457430095, + "rewards/mask_iou_reward": 0.5374415728715047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.493720293045044, + "rewards/thk_ans_format_reward": 1.0, + "step": 1143, + "think_completion_length": 50.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.734375, + "epoch": 1.930860033726813, + "grad_norm": 7.788511637518061, + "kl": 0.4443359375, + "learning_rate": 6.141652613827993e-07, + "loss": 0.0005, + "reward": 3.612895131111145, + "reward_std": 0.06411353871226311, + "rewards/final_reward": 1.4882109120853437, + "rewards/mask_iou_reward": 0.7441054560426719, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6128951907157898, + "rewards/thk_ans_format_reward": 1.0, + "step": 1144, + "think_completion_length": 50.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.859375, + "epoch": 1.932546374367622, + "grad_norm": 18.964191292290973, + "kl": 0.45703125, + "learning_rate": 6.138279932546374e-07, + "loss": 0.0005, + "reward": 3.2941290140151978, + "reward_std": 0.17387644201517105, + "rewards/final_reward": 0.8750820196472904, + "rewards/mask_iou_reward": 0.4375410098236452, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2941290438175201, + "rewards/thk_ans_format_reward": 1.0, + "step": 1145, + "think_completion_length": 56.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.015625, + "epoch": 1.9342327150084317, + "grad_norm": 5.3943910580571774, + "kl": 0.4775390625, + "learning_rate": 6.134907251264755e-07, + "loss": 0.0005, + "reward": 3.3576735258102417, + "reward_std": 0.27631305903196335, + "rewards/final_reward": 1.9525827807939062, + "rewards/mask_iou_reward": 0.9762913903969531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3576735258102417, + "rewards/thk_ans_format_reward": 1.0, + "step": 1146, + "think_completion_length": 50.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.078125, + "epoch": 1.9359190556492412, + "grad_norm": 5.605297528578439, + "kl": 0.53125, + "learning_rate": 6.131534569983137e-07, + "loss": 0.0005, + "reward": 3.4034253358840942, + "reward_std": 0.3214539512991905, + "rewards/final_reward": 1.538946388924106, + "rewards/mask_iou_reward": 0.769473194462053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4034252166748047, + "rewards/thk_ans_format_reward": 1.0, + "step": 1147, + "think_completion_length": 52.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 1.9376053962900506, + "grad_norm": 11.819715756456931, + "kl": 0.4873046875, + "learning_rate": 6.128161888701517e-07, + "loss": 0.0005, + "reward": 3.4282305240631104, + "reward_std": 0.24819158017635345, + "rewards/final_reward": 1.0713962238537234, + "rewards/mask_iou_reward": 0.5356981119268617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4282305240631104, + "rewards/thk_ans_format_reward": 1.0, + "step": 1148, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.265625, + "epoch": 1.93929173693086, + "grad_norm": 13.390608371093085, + "kl": 0.4345703125, + "learning_rate": 6.124789207419898e-07, + "loss": 0.0004, + "reward": 3.344928026199341, + "reward_std": 0.26076687313616276, + "rewards/final_reward": 1.62952084881511, + "rewards/mask_iou_reward": 0.814760424407555, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3761780261993408, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1149, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.0, + "epoch": 1.9409780775716694, + "grad_norm": 11.501195267830298, + "kl": 0.4384765625, + "learning_rate": 6.12141652613828e-07, + "loss": 0.0004, + "reward": 3.365402102470398, + "reward_std": 0.43733248114585876, + "rewards/final_reward": 1.8439775256589257, + "rewards/mask_iou_reward": 0.9219887628294628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3654021620750427, + "rewards/thk_ans_format_reward": 1.0, + "step": 1150, + "think_completion_length": 48.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.71875, + "epoch": 1.942664418212479, + "grad_norm": 9.338780992051273, + "kl": 0.484375, + "learning_rate": 6.118043844856661e-07, + "loss": 0.0005, + "reward": 3.5329582691192627, + "reward_std": 0.04067577584646642, + "rewards/final_reward": 1.7367798965568824, + "rewards/mask_iou_reward": 0.8683899482784412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5329583883285522, + "rewards/thk_ans_format_reward": 1.0, + "step": 1151, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.640625, + "epoch": 1.9443507588532882, + "grad_norm": 4.906728729430427, + "kl": 0.46484375, + "learning_rate": 6.114671163575042e-07, + "loss": 0.0004, + "reward": 3.2201439142227173, + "reward_std": 0.1605071723461151, + "rewards/final_reward": 1.255439579437334, + "rewards/mask_iou_reward": 0.627719789718667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2201440632343292, + "rewards/thk_ans_format_reward": 1.0, + "step": 1152, + "think_completion_length": 44.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.171875, + "epoch": 1.9460370994940979, + "grad_norm": 8.460179438461045, + "kl": 0.5126953125, + "learning_rate": 6.111298482293423e-07, + "loss": 0.0005, + "reward": 3.0460928678512573, + "reward_std": 0.18447109311819077, + "rewards/final_reward": 0.9994140920300079, + "rewards/mask_iou_reward": 0.49970704601500393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0460927784442902, + "rewards/thk_ans_format_reward": 1.0, + "step": 1153, + "think_completion_length": 55.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 1.9477234401349073, + "grad_norm": 7.523583427820683, + "kl": 0.501953125, + "learning_rate": 6.107925801011804e-07, + "loss": 0.0005, + "reward": 3.3284194469451904, + "reward_std": 0.32087790966033936, + "rewards/final_reward": 1.168077053645009, + "rewards/mask_iou_reward": 0.5840385268225045, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3284194469451904, + "rewards/thk_ans_format_reward": 1.0, + "step": 1154, + "think_completion_length": 55.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.71875, + "epoch": 1.9494097807757167, + "grad_norm": 6.853582812271886, + "kl": 0.474609375, + "learning_rate": 6.104553119730185e-07, + "loss": 0.0005, + "reward": 3.2801350355148315, + "reward_std": 0.148701723664999, + "rewards/final_reward": 1.2226313129202175, + "rewards/mask_iou_reward": 0.6113156564601088, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.280134916305542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1155, + "think_completion_length": 46.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.046875, + "epoch": 1.951096121416526, + "grad_norm": 5.466248056159601, + "kl": 0.458984375, + "learning_rate": 6.101180438448566e-07, + "loss": 0.0005, + "reward": 2.5636887550354004, + "reward_std": 0.048969279043376446, + "rewards/final_reward": 0.46655739556382414, + "rewards/mask_iou_reward": 0.23327869778191207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5636887392029166, + "rewards/thk_ans_format_reward": 1.0, + "step": 1156, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.171875, + "epoch": 1.9527824620573355, + "grad_norm": 28.18428620248401, + "kl": 0.4375, + "learning_rate": 6.097807757166947e-07, + "loss": 0.0004, + "reward": 3.68328320980072, + "reward_std": 0.29689711332321167, + "rewards/final_reward": 1.7442000166514389, + "rewards/mask_iou_reward": 0.8721000083257194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6832832098007202, + "rewards/thk_ans_format_reward": 1.0, + "step": 1157, + "think_completion_length": 49.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.953125, + "epoch": 1.9544688026981452, + "grad_norm": 6.4102496244791345, + "kl": 0.482421875, + "learning_rate": 6.094435075885328e-07, + "loss": 0.0005, + "reward": 3.239464044570923, + "reward_std": 0.1461598314344883, + "rewards/final_reward": 1.5674991248390242, + "rewards/mask_iou_reward": 0.7837495624195121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2394639253616333, + "rewards/thk_ans_format_reward": 1.0, + "step": 1158, + "think_completion_length": 49.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.625, + "epoch": 1.9561551433389543, + "grad_norm": 25.05812311249102, + "kl": 0.470703125, + "learning_rate": 6.09106239460371e-07, + "loss": 0.0005, + "reward": 3.1786834001541138, + "reward_std": 0.06434584688395262, + "rewards/final_reward": 0.6210890595120667, + "rewards/mask_iou_reward": 0.31054452975603336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1786834001541138, + "rewards/thk_ans_format_reward": 1.0, + "step": 1159, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.8125, + "epoch": 1.957841483979764, + "grad_norm": 26.698160368369376, + "kl": 0.49609375, + "learning_rate": 6.087689713322091e-07, + "loss": 0.0005, + "reward": 3.458927035331726, + "reward_std": 0.11408116295933723, + "rewards/final_reward": 1.2309284164893097, + "rewards/mask_iou_reward": 0.6154642082446549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4589269161224365, + "rewards/thk_ans_format_reward": 1.0, + "step": 1160, + "think_completion_length": 53.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.453125, + "epoch": 1.9595278246205734, + "grad_norm": 9.673867972324873, + "kl": 0.46875, + "learning_rate": 6.084317032040473e-07, + "loss": 0.0005, + "reward": 3.612801671028137, + "reward_std": 0.14205888658761978, + "rewards/final_reward": 1.7749510858760662, + "rewards/mask_iou_reward": 0.8874755429380331, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6128017902374268, + "rewards/thk_ans_format_reward": 1.0, + "step": 1161, + "think_completion_length": 49.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.296875, + "epoch": 1.9612141652613828, + "grad_norm": 5.787770039930185, + "kl": 0.4521484375, + "learning_rate": 6.080944350758853e-07, + "loss": 0.0004, + "reward": 3.0567870140075684, + "reward_std": 0.25405317917466164, + "rewards/final_reward": 1.1140749716155698, + "rewards/mask_iou_reward": 0.5570374858077849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0567870736122131, + "rewards/thk_ans_format_reward": 1.0, + "step": 1162, + "think_completion_length": 52.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.671875, + "epoch": 1.9629005059021922, + "grad_norm": 9.320319779187539, + "kl": 0.53125, + "learning_rate": 6.077571669477234e-07, + "loss": 0.0005, + "reward": 3.3574907779693604, + "reward_std": 0.14523665606975555, + "rewards/final_reward": 1.5528862513209964, + "rewards/mask_iou_reward": 0.7764431256604982, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3574907779693604, + "rewards/thk_ans_format_reward": 1.0, + "step": 1163, + "think_completion_length": 51.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.65625, + "epoch": 1.9645868465430016, + "grad_norm": 11.481249074007033, + "kl": 0.50390625, + "learning_rate": 6.074198988195615e-07, + "loss": 0.0005, + "reward": 3.671083688735962, + "reward_std": 0.1421994436532259, + "rewards/final_reward": 1.7619791945670902, + "rewards/mask_iou_reward": 0.8809895972835451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6710836291313171, + "rewards/thk_ans_format_reward": 1.0, + "step": 1164, + "think_completion_length": 43.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.96875, + "epoch": 1.9662731871838113, + "grad_norm": 10.034131798115844, + "kl": 0.478515625, + "learning_rate": 6.070826306913996e-07, + "loss": 0.0005, + "reward": 3.3944214582443237, + "reward_std": 0.1728556640446186, + "rewards/final_reward": 1.7270548646100559, + "rewards/mask_iou_reward": 0.8635274323050279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3944214880466461, + "rewards/thk_ans_format_reward": 1.0, + "step": 1165, + "think_completion_length": 46.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.203125, + "epoch": 1.9679595278246205, + "grad_norm": 8.306976055936005, + "kl": 0.494140625, + "learning_rate": 6.067453625632377e-07, + "loss": 0.0005, + "reward": 3.1458224058151245, + "reward_std": 0.10424304194748402, + "rewards/final_reward": 1.2310617441756069, + "rewards/mask_iou_reward": 0.6155308720878034, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.145822286605835, + "rewards/thk_ans_format_reward": 1.0, + "step": 1166, + "think_completion_length": 49.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.671875, + "epoch": 1.96964586846543, + "grad_norm": 6.85253908949388, + "kl": 0.484375, + "learning_rate": 6.064080944350759e-07, + "loss": 0.0005, + "reward": 3.0486035346984863, + "reward_std": 0.2735901027917862, + "rewards/final_reward": 0.955588583381466, + "rewards/mask_iou_reward": 0.477794291690733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0486035346984863, + "rewards/thk_ans_format_reward": 1.0, + "step": 1167, + "think_completion_length": 53.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.3125, + "epoch": 1.9713322091062393, + "grad_norm": 8.427958007136285, + "kl": 0.474609375, + "learning_rate": 6.06070826306914e-07, + "loss": 0.0005, + "reward": 3.0173048973083496, + "reward_std": 0.37154044955968857, + "rewards/final_reward": 0.867946832341649, + "rewards/mask_iou_reward": 0.4339734161708245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0173049569129944, + "rewards/thk_ans_format_reward": 1.0, + "step": 1168, + "think_completion_length": 51.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.625, + "epoch": 1.973018549747049, + "grad_norm": 7.79990454182524, + "kl": 0.48828125, + "learning_rate": 6.057335581787521e-07, + "loss": 0.0005, + "reward": 2.6458925008773804, + "reward_std": 0.4773362725973129, + "rewards/final_reward": 0.6373366666011101, + "rewards/mask_iou_reward": 0.31866833330055505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6458925604820251, + "rewards/thk_ans_format_reward": 1.0, + "step": 1169, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.625, + "epoch": 1.9747048903878583, + "grad_norm": 4.4719299493800975, + "kl": 0.4609375, + "learning_rate": 6.053962900505903e-07, + "loss": 0.0004, + "reward": 3.413856029510498, + "reward_std": 0.014244536869227886, + "rewards/final_reward": 1.2102401186865768, + "rewards/mask_iou_reward": 0.6051200593432884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.413856029510498, + "rewards/thk_ans_format_reward": 1.0, + "step": 1170, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.03125, + "epoch": 1.9763912310286678, + "grad_norm": 6.177077156973483, + "kl": 0.5625, + "learning_rate": 6.050590219224283e-07, + "loss": 0.0006, + "reward": 2.5054807662963867, + "reward_std": 0.06103113852441311, + "rewards/final_reward": 0.9434630830829109, + "rewards/mask_iou_reward": 0.47173154154145547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5054805800318718, + "rewards/thk_ans_format_reward": 1.0, + "step": 1171, + "think_completion_length": 53.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.671875, + "epoch": 1.9780775716694774, + "grad_norm": 7.89724293175771, + "kl": 0.498046875, + "learning_rate": 6.047217537942663e-07, + "loss": 0.0005, + "reward": 3.3290287256240845, + "reward_std": 0.16214152611792088, + "rewards/final_reward": 1.0813684613372112, + "rewards/mask_iou_reward": 0.5406842306686056, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3446537256240845, + "rewards/thk_ans_format_reward": 1.0, + "step": 1172, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.53125, + "epoch": 1.9797639123102866, + "grad_norm": 7.723278921818194, + "kl": 0.494140625, + "learning_rate": 6.043844856661045e-07, + "loss": 0.0005, + "reward": 3.550881505012512, + "reward_std": 0.20284011587500572, + "rewards/final_reward": 1.8957957185039027, + "rewards/mask_iou_reward": 0.9478978592519514, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5508816242218018, + "rewards/thk_ans_format_reward": 1.0, + "step": 1173, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0, + "epoch": 1.9814502529510962, + "grad_norm": 13.127430971098534, + "kl": 0.57421875, + "learning_rate": 6.040472175379426e-07, + "loss": 0.0006, + "reward": 3.3331719636917114, + "reward_std": 0.01610415242612362, + "rewards/final_reward": 1.524027746238251, + "rewards/mask_iou_reward": 0.7620138731191255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.333171784877777, + "rewards/thk_ans_format_reward": 1.0, + "step": 1174, + "think_completion_length": 41.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 1.9831365935919054, + "grad_norm": 17.88970728571287, + "kl": 0.4970703125, + "learning_rate": 6.037099494097807e-07, + "loss": 0.0005, + "reward": 3.2474048137664795, + "reward_std": 0.12032559514045715, + "rewards/final_reward": 1.0106248733440577, + "rewards/mask_iou_reward": 0.5053124366720289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2474048137664795, + "rewards/thk_ans_format_reward": 1.0, + "step": 1175, + "think_completion_length": 40.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.09375, + "epoch": 1.984822934232715, + "grad_norm": 5.86577008258473, + "kl": 0.5, + "learning_rate": 6.033726812816189e-07, + "loss": 0.0005, + "reward": 3.3224263191223145, + "reward_std": 0.11347942799329758, + "rewards/final_reward": 1.2982229708882325, + "rewards/mask_iou_reward": 0.6491114854441162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.322426199913025, + "rewards/thk_ans_format_reward": 1.0, + "step": 1176, + "think_completion_length": 44.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.15625, + "epoch": 1.9865092748735245, + "grad_norm": 13.214045562525113, + "kl": 0.46875, + "learning_rate": 6.03035413153457e-07, + "loss": 0.0004, + "reward": 3.273212194442749, + "reward_std": 0.15405417047441006, + "rewards/final_reward": 0.9980004495482484, + "rewards/mask_iou_reward": 0.4990002247741242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2732122540473938, + "rewards/thk_ans_format_reward": 1.0, + "step": 1177, + "think_completion_length": 44.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.765625, + "epoch": 1.9881956155143339, + "grad_norm": 7.791891780496249, + "kl": 0.50390625, + "learning_rate": 6.026981450252951e-07, + "loss": 0.0005, + "reward": 3.2183451652526855, + "reward_std": 0.33510997891426086, + "rewards/final_reward": 1.4571303503080095, + "rewards/mask_iou_reward": 0.7285651751540048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2183451652526855, + "rewards/thk_ans_format_reward": 1.0, + "step": 1178, + "think_completion_length": 54.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.25, + "epoch": 1.9898819561551433, + "grad_norm": 6.02650274959712, + "kl": 0.501953125, + "learning_rate": 6.023608768971333e-07, + "loss": 0.0005, + "reward": 3.1446791887283325, + "reward_std": 0.5253532081842422, + "rewards/final_reward": 1.4351317064026219, + "rewards/mask_iou_reward": 0.7175658532013109, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.144679307937622, + "rewards/thk_ans_format_reward": 1.0, + "step": 1179, + "think_completion_length": 38.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.796875, + "epoch": 1.9915682967959527, + "grad_norm": 7.805565918911529, + "kl": 0.4658203125, + "learning_rate": 6.020236087689713e-07, + "loss": 0.0005, + "reward": 3.2576130628585815, + "reward_std": 0.07200890593230724, + "rewards/final_reward": 0.6822467371243718, + "rewards/mask_iou_reward": 0.3411233685621859, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.257613182067871, + "rewards/thk_ans_format_reward": 1.0, + "step": 1180, + "think_completion_length": 46.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.203125, + "epoch": 1.9932546374367623, + "grad_norm": 8.633385502886387, + "kl": 0.4501953125, + "learning_rate": 6.016863406408093e-07, + "loss": 0.0004, + "reward": 3.324090003967285, + "reward_std": 0.198069479316473, + "rewards/final_reward": 1.0850171364270256, + "rewards/mask_iou_reward": 0.5425085682135128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3240899443626404, + "rewards/thk_ans_format_reward": 1.0, + "step": 1181, + "think_completion_length": 46.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.765625, + "epoch": 1.9949409780775715, + "grad_norm": 4.1814855348800295, + "kl": 0.4853515625, + "learning_rate": 6.013490725126475e-07, + "loss": 0.0005, + "reward": 3.1244304180145264, + "reward_std": 0.0062828969093970954, + "rewards/final_reward": 0.4318805218001468, + "rewards/mask_iou_reward": 0.2159402609000734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1244304180145264, + "rewards/thk_ans_format_reward": 1.0, + "step": 1182, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.8125, + "epoch": 1.9966273187183812, + "grad_norm": 12.505994574842456, + "kl": 0.5107421875, + "learning_rate": 6.010118043844856e-07, + "loss": 0.0005, + "reward": 2.967620015144348, + "reward_std": 0.14106937497854233, + "rewards/final_reward": 0.7828099518289345, + "rewards/mask_iou_reward": 0.39140497591446727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9676199853420258, + "rewards/thk_ans_format_reward": 1.0, + "step": 1183, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.66666793823242, + "epoch": 1.9983136593591906, + "grad_norm": 75.3561607357473, + "kl": 0.4482421875, + "learning_rate": 6.006745362563238e-07, + "loss": 0.0004, + "reward": 3.516904592514038, + "reward_std": 0.023798184003680944, + "rewards/final_reward": 1.5131991709008814, + "rewards/mask_iou_reward": 0.7565995854504407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5169046521186829, + "rewards/thk_ans_format_reward": 1.0, + "step": 1184, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 2.0016863406408096, + "grad_norm": 13.48689102759712, + "kl": 0.49609375, + "learning_rate": 6.003372681281619e-07, + "loss": 0.0005, + "reward": 2.9429726600646973, + "reward_std": 0.23621351923793554, + "rewards/final_reward": 0.49316087068991743, + "rewards/mask_iou_reward": 0.24658043534495871, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9429725408554077, + "rewards/thk_ans_format_reward": 1.0, + "step": 1185, + "think_completion_length": 48.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 2.003372681281619, + "grad_norm": 8.146315588376712, + "kl": 0.52734375, + "learning_rate": 6e-07, + "loss": 0.0005, + "reward": 3.3868921995162964, + "reward_std": 0.21877353638410568, + "rewards/final_reward": 1.0489110172147114, + "rewards/mask_iou_reward": 0.5244555086073557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3868921399116516, + "rewards/thk_ans_format_reward": 1.0, + "step": 1186, + "think_completion_length": 43.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.171875, + "epoch": 2.0050590219224285, + "grad_norm": 14.54303439688115, + "kl": 0.4765625, + "learning_rate": 5.996627318718382e-07, + "loss": 0.0005, + "reward": 3.432328224182129, + "reward_std": 0.12002099305391312, + "rewards/final_reward": 1.6099766606977879, + "rewards/mask_iou_reward": 0.8049883303488939, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4323282837867737, + "rewards/thk_ans_format_reward": 1.0, + "step": 1187, + "think_completion_length": 49.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.875, + "epoch": 2.0067453625632377, + "grad_norm": 5.788675497399444, + "kl": 0.4755859375, + "learning_rate": 5.993254637436763e-07, + "loss": 0.0005, + "reward": 3.4901273250579834, + "reward_std": 0.198878675699234, + "rewards/final_reward": 1.3744566459246905, + "rewards/mask_iou_reward": 0.6872283229623453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4901273250579834, + "rewards/thk_ans_format_reward": 1.0, + "step": 1188, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.140625, + "epoch": 2.0084317032040473, + "grad_norm": 4.644312190596549, + "kl": 0.515625, + "learning_rate": 5.989881956155142e-07, + "loss": 0.0005, + "reward": 2.759495258331299, + "reward_std": 0.20145833492279053, + "rewards/final_reward": 0.5538062995088217, + "rewards/mask_iou_reward": 0.2769031497544108, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7594953179359436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1189, + "think_completion_length": 46.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.265625, + "epoch": 2.0101180438448565, + "grad_norm": 7.67349455004292, + "kl": 0.498046875, + "learning_rate": 5.986509274873524e-07, + "loss": 0.0005, + "reward": 3.634737968444824, + "reward_std": 0.20400644093751907, + "rewards/final_reward": 1.9044805719221032, + "rewards/mask_iou_reward": 0.9522402859610516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.634738028049469, + "rewards/thk_ans_format_reward": 1.0, + "step": 1190, + "think_completion_length": 41.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.859375, + "epoch": 2.011804384485666, + "grad_norm": 8.301429030110507, + "kl": 0.48828125, + "learning_rate": 5.983136593591905e-07, + "loss": 0.0005, + "reward": 3.4187628030776978, + "reward_std": 0.2773596942424774, + "rewards/final_reward": 1.4393433284356094, + "rewards/mask_iou_reward": 0.7196716642178047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.418762981891632, + "rewards/thk_ans_format_reward": 1.0, + "step": 1191, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.03125, + "epoch": 2.0134907251264758, + "grad_norm": 6.439932075838745, + "kl": 0.51171875, + "learning_rate": 5.979763912310286e-07, + "loss": 0.0005, + "reward": 3.4730443954467773, + "reward_std": 0.04087753966450691, + "rewards/final_reward": 1.5909392846397068, + "rewards/mask_iou_reward": 0.7954696423198534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.473044514656067, + "rewards/thk_ans_format_reward": 1.0, + "step": 1192, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.90625, + "epoch": 2.015177065767285, + "grad_norm": 8.227312462160738, + "kl": 0.4912109375, + "learning_rate": 5.976391231028668e-07, + "loss": 0.0005, + "reward": 3.2172244787216187, + "reward_std": 0.18251924961805344, + "rewards/final_reward": 1.179570640819125, + "rewards/mask_iou_reward": 0.5897853204095626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2172245979309082, + "rewards/thk_ans_format_reward": 1.0, + "step": 1193, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.03125, + "epoch": 2.0168634064080946, + "grad_norm": 8.540116550794291, + "kl": 0.509765625, + "learning_rate": 5.973018549747049e-07, + "loss": 0.0005, + "reward": 2.839731454849243, + "reward_std": 0.11079123802483082, + "rewards/final_reward": 1.239055328482462, + "rewards/mask_iou_reward": 0.619527664241231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8397313356399536, + "rewards/thk_ans_format_reward": 1.0, + "step": 1194, + "think_completion_length": 39.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.734375, + "epoch": 2.0185497470489038, + "grad_norm": 5.567108294211803, + "kl": 0.4599609375, + "learning_rate": 5.96964586846543e-07, + "loss": 0.0005, + "reward": 3.490329623222351, + "reward_std": 0.0630667507648468, + "rewards/final_reward": 1.4267507476745882, + "rewards/mask_iou_reward": 0.7133753738372941, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4903295636177063, + "rewards/thk_ans_format_reward": 1.0, + "step": 1195, + "think_completion_length": 39.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.015625, + "epoch": 2.0202360876897134, + "grad_norm": 7.514581447833132, + "kl": 0.4443359375, + "learning_rate": 5.966273187183812e-07, + "loss": 0.0005, + "reward": 3.437077522277832, + "reward_std": 0.33910364657640457, + "rewards/final_reward": 1.6282311242662053, + "rewards/mask_iou_reward": 0.8141155621331027, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4370774626731873, + "rewards/thk_ans_format_reward": 1.0, + "step": 1196, + "think_completion_length": 44.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.875, + "epoch": 2.0219224283305226, + "grad_norm": 9.150943576049919, + "kl": 0.529296875, + "learning_rate": 5.962900505902191e-07, + "loss": 0.0005, + "reward": 3.362897038459778, + "reward_std": 0.18507951498031616, + "rewards/final_reward": 1.7366566859495336, + "rewards/mask_iou_reward": 0.8683283429747668, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3628970384597778, + "rewards/thk_ans_format_reward": 1.0, + "step": 1197, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 2.0236087689713322, + "grad_norm": 6.336774089846376, + "kl": 0.501953125, + "learning_rate": 5.959527824620572e-07, + "loss": 0.0005, + "reward": 3.3588712215423584, + "reward_std": 0.18687545135617256, + "rewards/final_reward": 1.085339322650293, + "rewards/mask_iou_reward": 0.5426696613251465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3588712215423584, + "rewards/thk_ans_format_reward": 1.0, + "step": 1198, + "think_completion_length": 47.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.921875, + "epoch": 2.0252951096121414, + "grad_norm": 10.164923995181558, + "kl": 0.54296875, + "learning_rate": 5.956155143338954e-07, + "loss": 0.0005, + "reward": 2.4154281616210938, + "reward_std": 0.11479150131344795, + "rewards/final_reward": 0.18002585004279, + "rewards/mask_iou_reward": 0.090012925021395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.41542813181877136, + "rewards/thk_ans_format_reward": 1.0, + "step": 1199, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.203125, + "epoch": 2.026981450252951, + "grad_norm": 6.859588749252243, + "kl": 0.4638671875, + "learning_rate": 5.952782462057335e-07, + "loss": 0.0005, + "reward": 3.202408790588379, + "reward_std": 0.0297409575432539, + "rewards/final_reward": 1.5953461359410652, + "rewards/mask_iou_reward": 0.7976730679705326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2024087607860565, + "rewards/thk_ans_format_reward": 1.0, + "step": 1200, + "think_completion_length": 45.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0, + "epoch": 2.0286677908937607, + "grad_norm": 10.968961604711549, + "kl": 0.5859375, + "learning_rate": 5.949409780775716e-07, + "loss": 0.0006, + "reward": 2.9323331117630005, + "reward_std": 0.06923278025351465, + "rewards/final_reward": 0.7952303924692466, + "rewards/mask_iou_reward": 0.3976151962346233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9323331415653229, + "rewards/thk_ans_format_reward": 1.0, + "step": 1201, + "think_completion_length": 40.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 2.03035413153457, + "grad_norm": 5.190037758988362, + "kl": 0.556640625, + "learning_rate": 5.946037099494098e-07, + "loss": 0.0005, + "reward": 2.9672462940216064, + "reward_std": 0.06309534143656492, + "rewards/final_reward": 1.3002051463077566, + "rewards/mask_iou_reward": 0.6501025731538783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9672463536262512, + "rewards/thk_ans_format_reward": 1.0, + "step": 1202, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.46875, + "epoch": 2.0320404721753795, + "grad_norm": 48.15014122950417, + "kl": 0.5029296875, + "learning_rate": 5.942664418212479e-07, + "loss": 0.0005, + "reward": 3.2930922508239746, + "reward_std": 0.1110190600156784, + "rewards/final_reward": 1.0016735899585203, + "rewards/mask_iou_reward": 0.5008367949792601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2930924892425537, + "rewards/thk_ans_format_reward": 1.0, + "step": 1203, + "think_completion_length": 48.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.59375, + "epoch": 2.0337268128161887, + "grad_norm": 12.588029094533338, + "kl": 0.5029296875, + "learning_rate": 5.93929173693086e-07, + "loss": 0.0005, + "reward": 3.519856095314026, + "reward_std": 0.05847097374498844, + "rewards/final_reward": 1.4558192890655186, + "rewards/mask_iou_reward": 0.7279096445327593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5198561549186707, + "rewards/thk_ans_format_reward": 1.0, + "step": 1204, + "think_completion_length": 47.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.453125, + "epoch": 2.0354131534569984, + "grad_norm": 3.8998339647410987, + "kl": 0.55078125, + "learning_rate": 5.935919055649242e-07, + "loss": 0.0005, + "reward": 2.5957932472229004, + "reward_std": 0.07152135111391544, + "rewards/final_reward": 0.6404311645514321, + "rewards/mask_iou_reward": 0.32021558227571606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5957932770252228, + "rewards/thk_ans_format_reward": 1.0, + "step": 1205, + "think_completion_length": 49.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.9375, + "epoch": 2.0370994940978076, + "grad_norm": 9.934504424693726, + "kl": 0.5078125, + "learning_rate": 5.932546374367621e-07, + "loss": 0.0005, + "reward": 2.9279143810272217, + "reward_std": 0.1831659022718668, + "rewards/final_reward": 1.4839509979409318, + "rewards/mask_iou_reward": 0.7419754989704659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9279144108295441, + "rewards/thk_ans_format_reward": 1.0, + "step": 1206, + "think_completion_length": 44.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 2.038785834738617, + "grad_norm": 6.8848677535750955, + "kl": 0.513671875, + "learning_rate": 5.929173693086002e-07, + "loss": 0.0005, + "reward": 3.8089691400527954, + "reward_std": 0.10258529148995876, + "rewards/final_reward": 1.7815592827991056, + "rewards/mask_iou_reward": 0.8907796413995528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8089691400527954, + "rewards/thk_ans_format_reward": 1.0, + "step": 1207, + "think_completion_length": 50.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.09375, + "epoch": 2.040472175379427, + "grad_norm": 10.670956222703316, + "kl": 0.544921875, + "learning_rate": 5.925801011804384e-07, + "loss": 0.0005, + "reward": 3.71175754070282, + "reward_std": 0.09188080579042435, + "rewards/final_reward": 1.552977126938897, + "rewards/mask_iou_reward": 0.7764885634694485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7117576003074646, + "rewards/thk_ans_format_reward": 1.0, + "step": 1208, + "think_completion_length": 43.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.171875, + "epoch": 2.042158516020236, + "grad_norm": 5.359893518780748, + "kl": 0.63671875, + "learning_rate": 5.922428330522765e-07, + "loss": 0.0006, + "reward": 3.751399278640747, + "reward_std": 0.13887044158764184, + "rewards/final_reward": 1.795235393339603, + "rewards/mask_iou_reward": 0.8976176966698015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.751399278640747, + "rewards/thk_ans_format_reward": 1.0, + "step": 1209, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.359375, + "epoch": 2.0438448566610457, + "grad_norm": 5.770311558844356, + "kl": 0.533203125, + "learning_rate": 5.919055649241147e-07, + "loss": 0.0005, + "reward": 3.644760251045227, + "reward_std": 0.05879488307982683, + "rewards/final_reward": 1.622706724028792, + "rewards/mask_iou_reward": 0.811353362014396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6447601914405823, + "rewards/thk_ans_format_reward": 1.0, + "step": 1210, + "think_completion_length": 47.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.375, + "epoch": 2.045531197301855, + "grad_norm": 20.427617565995828, + "kl": 0.49609375, + "learning_rate": 5.915682967959528e-07, + "loss": 0.0005, + "reward": 3.641817569732666, + "reward_std": 0.05982916243374348, + "rewards/final_reward": 1.7792524175183502, + "rewards/mask_iou_reward": 0.8896262087591751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6418176293373108, + "rewards/thk_ans_format_reward": 1.0, + "step": 1211, + "think_completion_length": 42.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.328125, + "epoch": 2.0472175379426645, + "grad_norm": 25.937607637845336, + "kl": 0.501953125, + "learning_rate": 5.912310286677909e-07, + "loss": 0.0005, + "reward": 3.0757253170013428, + "reward_std": 0.22424892336130142, + "rewards/final_reward": 1.3770070944798838, + "rewards/mask_iou_reward": 0.6885035472399419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0757253766059875, + "rewards/thk_ans_format_reward": 1.0, + "step": 1212, + "think_completion_length": 46.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.796875, + "epoch": 2.0489038785834737, + "grad_norm": 7.261303365634824, + "kl": 0.451171875, + "learning_rate": 5.908937605396291e-07, + "loss": 0.0005, + "reward": 3.3862454891204834, + "reward_std": 0.09621530398726463, + "rewards/final_reward": 1.1227074698332027, + "rewards/mask_iou_reward": 0.5613537349166013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3862455487251282, + "rewards/thk_ans_format_reward": 1.0, + "step": 1213, + "think_completion_length": 48.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.578125, + "epoch": 2.0505902192242833, + "grad_norm": 6.27675477947036, + "kl": 0.50390625, + "learning_rate": 5.905564924114671e-07, + "loss": 0.0005, + "reward": 3.8144431114196777, + "reward_std": 0.050515939481556416, + "rewards/final_reward": 1.823427526112407, + "rewards/mask_iou_reward": 0.9117137630562036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8144429326057434, + "rewards/thk_ans_format_reward": 1.0, + "step": 1214, + "think_completion_length": 52.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.453125, + "epoch": 2.052276559865093, + "grad_norm": 15.280398429391834, + "kl": 0.490234375, + "learning_rate": 5.902192242833051e-07, + "loss": 0.0005, + "reward": 3.0774126052856445, + "reward_std": 0.22712376341223717, + "rewards/final_reward": 1.1759968898950153, + "rewards/mask_iou_reward": 0.5879984449475076, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.077412635087967, + "rewards/thk_ans_format_reward": 1.0, + "step": 1215, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.171875, + "epoch": 2.053962900505902, + "grad_norm": 25.440565612456673, + "kl": 0.4755859375, + "learning_rate": 5.898819561551433e-07, + "loss": 0.0005, + "reward": 3.495232939720154, + "reward_std": 0.16499481350183487, + "rewards/final_reward": 1.3941976278260753, + "rewards/mask_iou_reward": 0.6970988139130376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.495232880115509, + "rewards/thk_ans_format_reward": 1.0, + "step": 1216, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.734375, + "epoch": 2.0556492411467118, + "grad_norm": 19.52427442013993, + "kl": 0.568359375, + "learning_rate": 5.895446880269814e-07, + "loss": 0.0006, + "reward": 3.5315277576446533, + "reward_std": 0.09806636191206053, + "rewards/final_reward": 1.7410238492657086, + "rewards/mask_iou_reward": 0.8705119246328543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5315275192260742, + "rewards/thk_ans_format_reward": 1.0, + "step": 1217, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.78125, + "epoch": 2.057335581787521, + "grad_norm": 7.077278333938166, + "kl": 0.48828125, + "learning_rate": 5.892074198988195e-07, + "loss": 0.0005, + "reward": 3.1986727714538574, + "reward_std": 0.15416544303297997, + "rewards/final_reward": 0.526140168463072, + "rewards/mask_iou_reward": 0.263070084231536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1986727714538574, + "rewards/thk_ans_format_reward": 1.0, + "step": 1218, + "think_completion_length": 45.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.59375, + "epoch": 2.0590219224283306, + "grad_norm": 12.056694496829845, + "kl": 0.513671875, + "learning_rate": 5.888701517706577e-07, + "loss": 0.0005, + "reward": 3.175121545791626, + "reward_std": 0.24305326491594315, + "rewards/final_reward": 1.5874991299200971, + "rewards/mask_iou_reward": 0.7937495649600486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1751216650009155, + "rewards/thk_ans_format_reward": 1.0, + "step": 1219, + "think_completion_length": 50.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.640625, + "epoch": 2.06070826306914, + "grad_norm": 8.927632556503001, + "kl": 0.5234375, + "learning_rate": 5.885328836424958e-07, + "loss": 0.0005, + "reward": 3.10360050201416, + "reward_std": 0.3249269500374794, + "rewards/final_reward": 0.7365527669777219, + "rewards/mask_iou_reward": 0.36827638348886094, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1036004424095154, + "rewards/thk_ans_format_reward": 1.0, + "step": 1220, + "think_completion_length": 39.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.234375, + "epoch": 2.0623946037099494, + "grad_norm": 6.797062459675585, + "kl": 0.482421875, + "learning_rate": 5.881956155143339e-07, + "loss": 0.0005, + "reward": 2.975393295288086, + "reward_std": 0.085931153036654, + "rewards/final_reward": 1.4435219891230973, + "rewards/mask_iou_reward": 0.7217609945615486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9753932952880859, + "rewards/thk_ans_format_reward": 1.0, + "step": 1221, + "think_completion_length": 42.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.265625, + "epoch": 2.064080944350759, + "grad_norm": 25.28742553611422, + "kl": 0.505859375, + "learning_rate": 5.87858347386172e-07, + "loss": 0.0005, + "reward": 3.0978381633758545, + "reward_std": 0.2666756585240364, + "rewards/final_reward": 1.2287549133668487, + "rewards/mask_iou_reward": 0.6143774566834244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0978381633758545, + "rewards/thk_ans_format_reward": 1.0, + "step": 1222, + "think_completion_length": 48.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.71875, + "epoch": 2.0657672849915683, + "grad_norm": 9.439762606777863, + "kl": 0.52734375, + "learning_rate": 5.875210792580101e-07, + "loss": 0.0005, + "reward": 3.324112296104431, + "reward_std": 0.16529548168182373, + "rewards/final_reward": 1.195829944225152, + "rewards/mask_iou_reward": 0.597914972112576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.324112355709076, + "rewards/thk_ans_format_reward": 1.0, + "step": 1223, + "think_completion_length": 45.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.921875, + "epoch": 2.067453625632378, + "grad_norm": 11.069853275445752, + "kl": 0.546875, + "learning_rate": 5.871838111298481e-07, + "loss": 0.0005, + "reward": 3.2145529985427856, + "reward_std": 0.12553077191114426, + "rewards/final_reward": 1.1087564448687577, + "rewards/mask_iou_reward": 0.5543782224343788, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.214552879333496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1224, + "think_completion_length": 50.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.6875, + "epoch": 2.069139966273187, + "grad_norm": 6.586658803173453, + "kl": 0.5498046875, + "learning_rate": 5.868465430016863e-07, + "loss": 0.0006, + "reward": 3.655348539352417, + "reward_std": 0.14335137605667114, + "rewards/final_reward": 1.6976510726235734, + "rewards/mask_iou_reward": 0.8488255363117867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.655348539352417, + "rewards/thk_ans_format_reward": 1.0, + "step": 1225, + "think_completion_length": 43.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.171875, + "epoch": 2.0708263069139967, + "grad_norm": 7.341675970877156, + "kl": 0.509765625, + "learning_rate": 5.865092748735244e-07, + "loss": 0.0005, + "reward": 3.4260720014572144, + "reward_std": 0.38464102149009705, + "rewards/final_reward": 1.2654199398354669, + "rewards/mask_iou_reward": 0.6327099699177334, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4416970610618591, + "rewards/thk_ans_format_reward": 1.0, + "step": 1226, + "think_completion_length": 48.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.03125, + "epoch": 2.072512647554806, + "grad_norm": 5.922152295385602, + "kl": 0.572265625, + "learning_rate": 5.861720067453625e-07, + "loss": 0.0006, + "reward": 3.173344850540161, + "reward_std": 0.0615835078060627, + "rewards/final_reward": 1.1209231989245945, + "rewards/mask_iou_reward": 0.5604615994622972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1733447909355164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1227, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.953125, + "epoch": 2.0741989881956155, + "grad_norm": 7.530184997434515, + "kl": 0.486328125, + "learning_rate": 5.858347386172007e-07, + "loss": 0.0005, + "reward": 3.558689832687378, + "reward_std": 0.06893946789205074, + "rewards/final_reward": 1.4888971286790864, + "rewards/mask_iou_reward": 0.7444485643395432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5586897134780884, + "rewards/thk_ans_format_reward": 1.0, + "step": 1228, + "think_completion_length": 46.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.078125, + "epoch": 2.075885328836425, + "grad_norm": 7.809389484619535, + "kl": 0.505859375, + "learning_rate": 5.854974704890388e-07, + "loss": 0.0005, + "reward": 2.6829360723495483, + "reward_std": 0.31088511645793915, + "rewards/final_reward": 0.8371374268188949, + "rewards/mask_iou_reward": 0.41856871340944746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6829361021518707, + "rewards/thk_ans_format_reward": 1.0, + "step": 1229, + "think_completion_length": 44.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.046875, + "epoch": 2.0775716694772344, + "grad_norm": 5.389053395617888, + "kl": 0.54296875, + "learning_rate": 5.851602023608768e-07, + "loss": 0.0005, + "reward": 3.397468686103821, + "reward_std": 0.11730508506298065, + "rewards/final_reward": 1.1659759993446794, + "rewards/mask_iou_reward": 0.5829879996723397, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3974686861038208, + "rewards/thk_ans_format_reward": 1.0, + "step": 1230, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.890625, + "epoch": 2.079258010118044, + "grad_norm": 18.292579630418203, + "kl": 0.515625, + "learning_rate": 5.84822934232715e-07, + "loss": 0.0005, + "reward": 3.2734872102737427, + "reward_std": 0.16774200648069382, + "rewards/final_reward": 1.4035551535338837, + "rewards/mask_iou_reward": 0.7017775767669419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2734872102737427, + "rewards/thk_ans_format_reward": 1.0, + "step": 1231, + "think_completion_length": 53.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.515625, + "epoch": 2.080944350758853, + "grad_norm": 9.232430436849269, + "kl": 0.50390625, + "learning_rate": 5.84485666104553e-07, + "loss": 0.0005, + "reward": 3.0458027124404907, + "reward_std": 0.21156837791204453, + "rewards/final_reward": 1.0424462781126267, + "rewards/mask_iou_reward": 0.5212231390563133, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0458028018474579, + "rewards/thk_ans_format_reward": 1.0, + "step": 1232, + "think_completion_length": 49.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.1875, + "epoch": 2.082630691399663, + "grad_norm": 11.183935733331714, + "kl": 0.4892578125, + "learning_rate": 5.841483979763911e-07, + "loss": 0.0005, + "reward": 3.0968226194381714, + "reward_std": 0.33744121342897415, + "rewards/final_reward": 1.1910839599885503, + "rewards/mask_iou_reward": 0.5955419799942752, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0968225598335266, + "rewards/thk_ans_format_reward": 1.0, + "step": 1233, + "think_completion_length": 52.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.015625, + "epoch": 2.084317032040472, + "grad_norm": 11.91292635625383, + "kl": 0.5234375, + "learning_rate": 5.838111298482293e-07, + "loss": 0.0005, + "reward": 3.551935076713562, + "reward_std": 0.10081242024898529, + "rewards/final_reward": 1.4946258071000098, + "rewards/mask_iou_reward": 0.7473129035500049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.551935076713562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1234, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.609375, + "epoch": 2.0860033726812817, + "grad_norm": 7.440830997494622, + "kl": 0.546875, + "learning_rate": 5.834738617200674e-07, + "loss": 0.0006, + "reward": 3.7355228662490845, + "reward_std": 0.10745073819998652, + "rewards/final_reward": 1.9838178236248292, + "rewards/mask_iou_reward": 0.9919089118124146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7355228066444397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1235, + "think_completion_length": 50.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.078125, + "epoch": 2.087689713322091, + "grad_norm": 7.373616608509708, + "kl": 0.484375, + "learning_rate": 5.831365935919056e-07, + "loss": 0.0005, + "reward": 3.488574981689453, + "reward_std": 0.03398977406322956, + "rewards/final_reward": 1.6369875219333916, + "rewards/mask_iou_reward": 0.8184937609666958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4885749220848083, + "rewards/thk_ans_format_reward": 1.0, + "step": 1236, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.8125, + "epoch": 2.0893760539629005, + "grad_norm": 9.540775863697288, + "kl": 0.6328125, + "learning_rate": 5.827993254637437e-07, + "loss": 0.0006, + "reward": 3.630674719810486, + "reward_std": 0.3000973165035248, + "rewards/final_reward": 1.5710772669419548, + "rewards/mask_iou_reward": 0.7855386334709774, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6306747198104858, + "rewards/thk_ans_format_reward": 1.0, + "step": 1237, + "think_completion_length": 49.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.53125, + "epoch": 2.09106239460371, + "grad_norm": 10.590781360485371, + "kl": 0.6044921875, + "learning_rate": 5.824620573355818e-07, + "loss": 0.0006, + "reward": 3.2602202892303467, + "reward_std": 0.03827218525111675, + "rewards/final_reward": 1.7720339794504287, + "rewards/mask_iou_reward": 0.8860169897252144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.260220319032669, + "rewards/thk_ans_format_reward": 1.0, + "step": 1238, + "think_completion_length": 49.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.703125, + "epoch": 2.0927487352445193, + "grad_norm": 16.82641766082157, + "kl": 0.5390625, + "learning_rate": 5.821247892074199e-07, + "loss": 0.0005, + "reward": 3.7846381664276123, + "reward_std": 0.09372981078922749, + "rewards/final_reward": 1.7675882258712055, + "rewards/mask_iou_reward": 0.8837941129356027, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7846381068229675, + "rewards/thk_ans_format_reward": 1.0, + "step": 1239, + "think_completion_length": 56.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.328125, + "epoch": 2.094435075885329, + "grad_norm": 8.521619117103281, + "kl": 0.533203125, + "learning_rate": 5.81787521079258e-07, + "loss": 0.0005, + "reward": 3.1291738748550415, + "reward_std": 0.06243935413658619, + "rewards/final_reward": 0.5839306049119234, + "rewards/mask_iou_reward": 0.2919653024559617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1291736364364624, + "rewards/thk_ans_format_reward": 1.0, + "step": 1240, + "think_completion_length": 44.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.875, + "epoch": 2.096121416526138, + "grad_norm": 3.4547751255274695, + "kl": 0.51953125, + "learning_rate": 5.81450252951096e-07, + "loss": 0.0005, + "reward": 3.304155707359314, + "reward_std": 0.009104110868065618, + "rewards/final_reward": 1.6302786885188945, + "rewards/mask_iou_reward": 0.8151393442594472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3041555881500244, + "rewards/thk_ans_format_reward": 1.0, + "step": 1241, + "think_completion_length": 54.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.375, + "epoch": 2.097807757166948, + "grad_norm": 12.224812085536941, + "kl": 0.48046875, + "learning_rate": 5.811129848229342e-07, + "loss": 0.0005, + "reward": 3.108389377593994, + "reward_std": 0.2590280845761299, + "rewards/final_reward": 1.6332673514867977, + "rewards/mask_iou_reward": 0.8166336757433988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1083892583847046, + "rewards/thk_ans_format_reward": 1.0, + "step": 1242, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.140625, + "epoch": 2.099494097807757, + "grad_norm": 12.419841778007031, + "kl": 0.462890625, + "learning_rate": 5.807757166947723e-07, + "loss": 0.0005, + "reward": 3.318019986152649, + "reward_std": 0.0800204686820507, + "rewards/final_reward": 1.2042707380623143, + "rewards/mask_iou_reward": 0.6021353690311572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3180198669433594, + "rewards/thk_ans_format_reward": 1.0, + "step": 1243, + "think_completion_length": 50.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.75, + "epoch": 2.1011804384485666, + "grad_norm": 7.57509628016737, + "kl": 0.556640625, + "learning_rate": 5.804384485666104e-07, + "loss": 0.0006, + "reward": 3.264046311378479, + "reward_std": 0.0792916975915432, + "rewards/final_reward": 1.637747794359674, + "rewards/mask_iou_reward": 0.818873897179837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2640464305877686, + "rewards/thk_ans_format_reward": 1.0, + "step": 1244, + "think_completion_length": 44.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 2.1028667790893762, + "grad_norm": 46.38176561705908, + "kl": 0.4931640625, + "learning_rate": 5.801011804384486e-07, + "loss": 0.0005, + "reward": 2.916468858718872, + "reward_std": 0.31557588279247284, + "rewards/final_reward": 1.3054415203274055, + "rewards/mask_iou_reward": 0.6527207601637027, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9164688587188721, + "rewards/thk_ans_format_reward": 1.0, + "step": 1245, + "think_completion_length": 44.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.171875, + "epoch": 2.1045531197301854, + "grad_norm": 21.665840127244977, + "kl": 0.5, + "learning_rate": 5.797639123102867e-07, + "loss": 0.0005, + "reward": 3.4545528888702393, + "reward_std": 0.17094121873378754, + "rewards/final_reward": 1.7188712109586382, + "rewards/mask_iou_reward": 0.8594356054793191, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4545528292655945, + "rewards/thk_ans_format_reward": 1.0, + "step": 1246, + "think_completion_length": 45.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.578125, + "epoch": 2.106239460370995, + "grad_norm": 32.64713621094739, + "kl": 0.4765625, + "learning_rate": 5.794266441821247e-07, + "loss": 0.0005, + "reward": 3.6517735719680786, + "reward_std": 0.10199225321412086, + "rewards/final_reward": 1.5091516226489148, + "rewards/mask_iou_reward": 0.7545758113244574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6517733931541443, + "rewards/thk_ans_format_reward": 1.0, + "step": 1247, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.484375, + "epoch": 2.1079258010118043, + "grad_norm": 7.1967143913972, + "kl": 0.48046875, + "learning_rate": 5.790893760539629e-07, + "loss": 0.0005, + "reward": 3.353150963783264, + "reward_std": 0.11774658411741257, + "rewards/final_reward": 1.748703370024331, + "rewards/mask_iou_reward": 0.8743516850121655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3531509041786194, + "rewards/thk_ans_format_reward": 1.0, + "step": 1248, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.78125, + "epoch": 2.109612141652614, + "grad_norm": 55.73109806211543, + "kl": 0.509765625, + "learning_rate": 5.78752107925801e-07, + "loss": 0.0005, + "reward": 3.6747478246688843, + "reward_std": 0.10688724555075169, + "rewards/final_reward": 1.561100560613465, + "rewards/mask_iou_reward": 0.7805502803067325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6747477650642395, + "rewards/thk_ans_format_reward": 1.0, + "step": 1249, + "think_completion_length": 43.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.78125, + "epoch": 2.111298482293423, + "grad_norm": 10.108456421636822, + "kl": 0.51171875, + "learning_rate": 5.78414839797639e-07, + "loss": 0.0005, + "reward": 3.0707567930221558, + "reward_std": 0.19155866652727127, + "rewards/final_reward": 0.6565987376431796, + "rewards/mask_iou_reward": 0.3282993688215898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0707568526268005, + "rewards/thk_ans_format_reward": 1.0, + "step": 1250, + "think_completion_length": 53.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.09375, + "epoch": 2.1129848229342327, + "grad_norm": 13.009707743391786, + "kl": 0.484375, + "learning_rate": 5.780775716694772e-07, + "loss": 0.0005, + "reward": 3.369237780570984, + "reward_std": 0.17601485550403595, + "rewards/final_reward": 1.2133884638153491, + "rewards/mask_iou_reward": 0.6066942319076746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3692377805709839, + "rewards/thk_ans_format_reward": 1.0, + "step": 1251, + "think_completion_length": 46.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.046875, + "epoch": 2.1146711635750424, + "grad_norm": 6.729008572861788, + "kl": 0.4697265625, + "learning_rate": 5.777403035413153e-07, + "loss": 0.0005, + "reward": 3.363896131515503, + "reward_std": 0.3527638018131256, + "rewards/final_reward": 1.2416063426888746, + "rewards/mask_iou_reward": 0.6208031713444373, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3638960123062134, + "rewards/thk_ans_format_reward": 1.0, + "step": 1252, + "think_completion_length": 49.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.40625, + "epoch": 2.1163575042158516, + "grad_norm": 782078.1110492643, + "kl": 235520.27734375, + "learning_rate": 5.774030354131534e-07, + "loss": 236.1547, + "reward": 3.241525888442993, + "reward_std": 0.1571502909064293, + "rewards/final_reward": 1.521762280870485, + "rewards/mask_iou_reward": 0.7608811404352425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2571508586406708, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1253, + "think_completion_length": 50.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.625, + "epoch": 2.118043844856661, + "grad_norm": 8.010863739990121, + "kl": 0.5009765625, + "learning_rate": 5.770657672849916e-07, + "loss": 0.0005, + "reward": 2.98062264919281, + "reward_std": 0.07590826600790024, + "rewards/final_reward": 0.8288698686168807, + "rewards/mask_iou_reward": 0.41443493430844036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9806226491928101, + "rewards/thk_ans_format_reward": 1.0, + "step": 1254, + "think_completion_length": 56.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.484375, + "epoch": 2.1197301854974704, + "grad_norm": 17.237015051028266, + "kl": 0.5048828125, + "learning_rate": 5.767284991568296e-07, + "loss": 0.0005, + "reward": 3.5511358976364136, + "reward_std": 0.06634262204170227, + "rewards/final_reward": 1.6558075137380417, + "rewards/mask_iou_reward": 0.8279037568690208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.551135778427124, + "rewards/thk_ans_format_reward": 1.0, + "step": 1255, + "think_completion_length": 45.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.421875, + "epoch": 2.12141652613828, + "grad_norm": 22.379463074628816, + "kl": 0.4677734375, + "learning_rate": 5.763912310286677e-07, + "loss": 0.0005, + "reward": 3.63353431224823, + "reward_std": 0.030599688179790974, + "rewards/final_reward": 1.3697178234861853, + "rewards/mask_iou_reward": 0.6848589117430927, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.63353431224823, + "rewards/thk_ans_format_reward": 1.0, + "step": 1256, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.546875, + "epoch": 2.123102866779089, + "grad_norm": 8.866637777736178, + "kl": 0.51171875, + "learning_rate": 5.760539629005059e-07, + "loss": 0.0005, + "reward": 3.4119359254837036, + "reward_std": 0.39036141335964203, + "rewards/final_reward": 1.3524345485826261, + "rewards/mask_iou_reward": 0.6762172742913131, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4119358658790588, + "rewards/thk_ans_format_reward": 1.0, + "step": 1257, + "think_completion_length": 41.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 2.124789207419899, + "grad_norm": 7.424783872127569, + "kl": 0.490234375, + "learning_rate": 5.75716694772344e-07, + "loss": 0.0005, + "reward": 3.281718134880066, + "reward_std": 0.17376143485307693, + "rewards/final_reward": 1.2578834852308733, + "rewards/mask_iou_reward": 0.6289417426154367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2817181050777435, + "rewards/thk_ans_format_reward": 1.0, + "step": 1258, + "think_completion_length": 52.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.046875, + "epoch": 2.126475548060708, + "grad_norm": 8.587208587499049, + "kl": 0.533203125, + "learning_rate": 5.753794266441822e-07, + "loss": 0.0006, + "reward": 3.0733892917633057, + "reward_std": 0.18975719437003136, + "rewards/final_reward": 0.683115645927564, + "rewards/mask_iou_reward": 0.341557822963782, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0733892023563385, + "rewards/thk_ans_format_reward": 1.0, + "step": 1259, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.734375, + "epoch": 2.1281618887015177, + "grad_norm": 8.018608509495962, + "kl": 0.474609375, + "learning_rate": 5.750421585160202e-07, + "loss": 0.0005, + "reward": 3.535109758377075, + "reward_std": 0.08410511817783117, + "rewards/final_reward": 1.7809249511905496, + "rewards/mask_iou_reward": 0.8904624755952748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5351097583770752, + "rewards/thk_ans_format_reward": 1.0, + "step": 1260, + "think_completion_length": 39.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.859375, + "epoch": 2.1298482293423273, + "grad_norm": 9.534473471390374, + "kl": 0.52734375, + "learning_rate": 5.747048903878583e-07, + "loss": 0.0006, + "reward": 3.028801918029785, + "reward_std": 0.12135305255651474, + "rewards/final_reward": 1.189611819827427, + "rewards/mask_iou_reward": 0.5948059099137135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.028801828622818, + "rewards/thk_ans_format_reward": 1.0, + "step": 1261, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.203125, + "epoch": 2.1315345699831365, + "grad_norm": 6.395713232763334, + "kl": 0.509765625, + "learning_rate": 5.743676222596965e-07, + "loss": 0.0005, + "reward": 3.2245901823043823, + "reward_std": 0.10203266330063343, + "rewards/final_reward": 1.3326993714609847, + "rewards/mask_iou_reward": 0.6663496857304924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.224590003490448, + "rewards/thk_ans_format_reward": 1.0, + "step": 1262, + "think_completion_length": 44.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.40625, + "epoch": 2.133220910623946, + "grad_norm": 9.683686130868914, + "kl": 1.009765625, + "learning_rate": 5.740303541315346e-07, + "loss": 0.001, + "reward": 2.9035420417785645, + "reward_std": 0.2598446160554886, + "rewards/final_reward": 0.7311843953140473, + "rewards/mask_iou_reward": 0.36559219765702367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9035422205924988, + "rewards/thk_ans_format_reward": 1.0, + "step": 1263, + "think_completion_length": 48.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.234375, + "epoch": 2.1349072512647553, + "grad_norm": 16.42792138501997, + "kl": 0.4375, + "learning_rate": 5.736930860033726e-07, + "loss": 0.0004, + "reward": 3.252814292907715, + "reward_std": 0.07177349179983139, + "rewards/final_reward": 1.13756073486116, + "rewards/mask_iou_reward": 0.56878036743058, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2528142929077148, + "rewards/thk_ans_format_reward": 1.0, + "step": 1264, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.09375, + "epoch": 2.136593591905565, + "grad_norm": 16.85648174584554, + "kl": 0.517578125, + "learning_rate": 5.733558178752108e-07, + "loss": 0.0005, + "reward": 3.448030710220337, + "reward_std": 0.1371849738061428, + "rewards/final_reward": 1.4135558012866825, + "rewards/mask_iou_reward": 0.7067779006433412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4480305314064026, + "rewards/thk_ans_format_reward": 1.0, + "step": 1265, + "think_completion_length": 45.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.09375, + "epoch": 2.138279932546374, + "grad_norm": 6.680793787636658, + "kl": 0.564453125, + "learning_rate": 5.730185497470489e-07, + "loss": 0.0004, + "reward": 3.5273772478103638, + "reward_std": 0.011883015278726816, + "rewards/final_reward": 1.1562473751345796, + "rewards/mask_iou_reward": 0.5781236875672898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5273773074150085, + "rewards/thk_ans_format_reward": 1.0, + "step": 1266, + "think_completion_length": 48.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.515625, + "epoch": 2.139966273187184, + "grad_norm": 8.837263046519144, + "kl": 0.5380859375, + "learning_rate": 5.72681281618887e-07, + "loss": 0.0005, + "reward": 3.2997041940689087, + "reward_std": 0.49166443943977356, + "rewards/final_reward": 1.1886923108384135, + "rewards/mask_iou_reward": 0.5943461554192068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2997040748596191, + "rewards/thk_ans_format_reward": 1.0, + "step": 1267, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.40625, + "epoch": 2.1416526138279934, + "grad_norm": 9.282787614222825, + "kl": 0.505859375, + "learning_rate": 5.723440134907252e-07, + "loss": 0.0005, + "reward": 3.345510482788086, + "reward_std": 0.002183900447562337, + "rewards/final_reward": 0.9688242810184894, + "rewards/mask_iou_reward": 0.4844121405092447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3455105423927307, + "rewards/thk_ans_format_reward": 1.0, + "step": 1268, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.84375, + "epoch": 2.1433389544688026, + "grad_norm": 9.295865669291606, + "kl": 0.501953125, + "learning_rate": 5.720067453625632e-07, + "loss": 0.0005, + "reward": 3.7576842308044434, + "reward_std": 0.04031405784189701, + "rewards/final_reward": 1.6845229352375926, + "rewards/mask_iou_reward": 0.8422614676187963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7576842308044434, + "rewards/thk_ans_format_reward": 1.0, + "step": 1269, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.15625, + "epoch": 2.1450252951096123, + "grad_norm": 11.174241717616878, + "kl": 0.484375, + "learning_rate": 5.716694772344013e-07, + "loss": 0.0004, + "reward": 3.5124897956848145, + "reward_std": 0.03136880323290825, + "rewards/final_reward": 1.4931899922318204, + "rewards/mask_iou_reward": 0.7465949961159102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5124897956848145, + "rewards/thk_ans_format_reward": 1.0, + "step": 1270, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 2.1467116357504215, + "grad_norm": 15.755950032963167, + "kl": 0.484375, + "learning_rate": 5.713322091062395e-07, + "loss": 0.0005, + "reward": 3.5956294536590576, + "reward_std": 0.09438778925687075, + "rewards/final_reward": 1.4855681345253289, + "rewards/mask_iou_reward": 0.7427840672626644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5956295132637024, + "rewards/thk_ans_format_reward": 1.0, + "step": 1271, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.390625, + "epoch": 2.148397976391231, + "grad_norm": 5.144881924672007, + "kl": 0.52734375, + "learning_rate": 5.709949409780775e-07, + "loss": 0.0005, + "reward": 3.665688157081604, + "reward_std": 0.10815348476171494, + "rewards/final_reward": 1.5873789454257476, + "rewards/mask_iou_reward": 0.7936894727128738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6656880974769592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1272, + "think_completion_length": 44.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.90625, + "epoch": 2.1500843170320403, + "grad_norm": 30.158297132887448, + "kl": 0.53125, + "learning_rate": 5.706576728499156e-07, + "loss": 0.0005, + "reward": 3.2382161617279053, + "reward_std": 0.06967388093471527, + "rewards/final_reward": 1.1723690364705708, + "rewards/mask_iou_reward": 0.5861845182352854, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2382160425186157, + "rewards/thk_ans_format_reward": 1.0, + "step": 1273, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.84375, + "epoch": 2.15177065767285, + "grad_norm": 7.935702624105307, + "kl": 0.509765625, + "learning_rate": 5.703204047217538e-07, + "loss": 0.0005, + "reward": 3.0919833183288574, + "reward_std": 0.049351561814546585, + "rewards/final_reward": 1.068108735833558, + "rewards/mask_iou_reward": 0.534054367916779, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0919832587242126, + "rewards/thk_ans_format_reward": 1.0, + "step": 1274, + "think_completion_length": 38.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.40625, + "epoch": 2.1534569983136596, + "grad_norm": 14.373076999773213, + "kl": 0.4970703125, + "learning_rate": 5.699831365935919e-07, + "loss": 0.0005, + "reward": 3.308625102043152, + "reward_std": 0.17064137011766434, + "rewards/final_reward": 0.8674972316005486, + "rewards/mask_iou_reward": 0.4337486158002743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3086249232292175, + "rewards/thk_ans_format_reward": 1.0, + "step": 1275, + "think_completion_length": 41.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.890625, + "epoch": 2.1551433389544687, + "grad_norm": 6.085309378106807, + "kl": 0.474609375, + "learning_rate": 5.6964586846543e-07, + "loss": 0.0005, + "reward": 3.5639824867248535, + "reward_std": 0.03305263817310333, + "rewards/final_reward": 1.417472939633353, + "rewards/mask_iou_reward": 0.7087364698166765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.563982605934143, + "rewards/thk_ans_format_reward": 1.0, + "step": 1276, + "think_completion_length": 43.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.25, + "epoch": 2.1568296795952784, + "grad_norm": 17.903092457178236, + "kl": 0.52734375, + "learning_rate": 5.693086003372681e-07, + "loss": 0.0005, + "reward": 3.7228747606277466, + "reward_std": 0.13351622968912125, + "rewards/final_reward": 1.6980407653162277, + "rewards/mask_iou_reward": 0.8490203826581139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7228747606277466, + "rewards/thk_ans_format_reward": 1.0, + "step": 1277, + "think_completion_length": 42.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.640625, + "epoch": 2.1585160202360876, + "grad_norm": 9.2663164314058, + "kl": 0.501953125, + "learning_rate": 5.689713322091062e-07, + "loss": 0.0005, + "reward": 3.1014504432678223, + "reward_std": 0.09565165266394615, + "rewards/final_reward": 1.5953974965658613, + "rewards/mask_iou_reward": 0.7976987482829306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1014504432678223, + "rewards/thk_ans_format_reward": 1.0, + "step": 1278, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.515625, + "epoch": 2.160202360876897, + "grad_norm": 5.838692379118382, + "kl": 0.529296875, + "learning_rate": 5.686340640809443e-07, + "loss": 0.0005, + "reward": 3.6012667417526245, + "reward_std": 0.011416994035243988, + "rewards/final_reward": 1.5336660571261627, + "rewards/mask_iou_reward": 0.7668330285630813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6012668013572693, + "rewards/thk_ans_format_reward": 1.0, + "step": 1279, + "think_completion_length": 41.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.875, + "epoch": 2.1618887015177064, + "grad_norm": 17.41323672321895, + "kl": 0.513671875, + "learning_rate": 5.682967959527824e-07, + "loss": 0.0005, + "reward": 3.361006498336792, + "reward_std": 0.034786591306328773, + "rewards/final_reward": 1.9065231356008847, + "rewards/mask_iou_reward": 0.9532615678004424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3610064387321472, + "rewards/thk_ans_format_reward": 1.0, + "step": 1280, + "think_completion_length": 40.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.703125, + "epoch": 2.163575042158516, + "grad_norm": 5.0781830994598085, + "kl": 0.5234375, + "learning_rate": 5.679595278246205e-07, + "loss": 0.0005, + "reward": 3.4662747383117676, + "reward_std": 0.2071358636021614, + "rewards/final_reward": 1.8022344144039115, + "rewards/mask_iou_reward": 0.9011172072019558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4662747383117676, + "rewards/thk_ans_format_reward": 1.0, + "step": 1281, + "think_completion_length": 40.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.46875, + "epoch": 2.1652613827993257, + "grad_norm": 5.313808569171344, + "kl": 0.51953125, + "learning_rate": 5.676222596964586e-07, + "loss": 0.0005, + "reward": 3.129016160964966, + "reward_std": 0.12660411186516285, + "rewards/final_reward": 1.2375794669776226, + "rewards/mask_iou_reward": 0.6187897334888113, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1290162205696106, + "rewards/thk_ans_format_reward": 1.0, + "step": 1282, + "think_completion_length": 39.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 2.166947723440135, + "grad_norm": 8.237313862885776, + "kl": 0.4453125, + "learning_rate": 5.672849915682968e-07, + "loss": 0.0004, + "reward": 3.0471763610839844, + "reward_std": 0.3178776204586029, + "rewards/final_reward": 0.6561711341302512, + "rewards/mask_iou_reward": 0.3280855670651256, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0471762418746948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1283, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.78125, + "epoch": 2.1686340640809445, + "grad_norm": 5.8652658890448865, + "kl": 0.4482421875, + "learning_rate": 5.669477234401349e-07, + "loss": 0.0005, + "reward": 3.6685824394226074, + "reward_std": 0.06685462407767773, + "rewards/final_reward": 1.8848773448353418, + "rewards/mask_iou_reward": 0.9424386724176709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6685824990272522, + "rewards/thk_ans_format_reward": 1.0, + "step": 1284, + "think_completion_length": 36.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 2.1703204047217537, + "grad_norm": 5.837632353873461, + "kl": 0.5546875, + "learning_rate": 5.666104553119731e-07, + "loss": 0.0005, + "reward": 3.229220151901245, + "reward_std": 0.02632999565685168, + "rewards/final_reward": 1.788056331911195, + "rewards/mask_iou_reward": 0.8940281659555975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2292202413082123, + "rewards/thk_ans_format_reward": 1.0, + "step": 1285, + "think_completion_length": 38.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.015625, + "epoch": 2.1720067453625633, + "grad_norm": 4.7995000021626115, + "kl": 0.49609375, + "learning_rate": 5.662731871838111e-07, + "loss": 0.0005, + "reward": 3.326902389526367, + "reward_std": 0.07282107695937157, + "rewards/final_reward": 0.9955913918744543, + "rewards/mask_iou_reward": 0.49779569593722717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3269023895263672, + "rewards/thk_ans_format_reward": 1.0, + "step": 1286, + "think_completion_length": 36.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.890625, + "epoch": 2.1736930860033725, + "grad_norm": 10.813729699788505, + "kl": 0.53515625, + "learning_rate": 5.659359190556492e-07, + "loss": 0.0005, + "reward": 3.4771808385849, + "reward_std": 0.19824712723493576, + "rewards/final_reward": 1.4848713419331876, + "rewards/mask_iou_reward": 0.7424356709665938, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4771807789802551, + "rewards/thk_ans_format_reward": 1.0, + "step": 1287, + "think_completion_length": 37.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0625, + "epoch": 2.175379426644182, + "grad_norm": 11.319448877957019, + "kl": 0.513671875, + "learning_rate": 5.655986509274874e-07, + "loss": 0.0005, + "reward": 2.9048445224761963, + "reward_std": 0.06361216679215431, + "rewards/final_reward": 0.8371745185363462, + "rewards/mask_iou_reward": 0.4185872592681731, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9048446416854858, + "rewards/thk_ans_format_reward": 1.0, + "step": 1288, + "think_completion_length": 41.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.671875, + "epoch": 2.177065767284992, + "grad_norm": 12.218867399167939, + "kl": 0.5546875, + "learning_rate": 5.652613827993254e-07, + "loss": 0.0006, + "reward": 3.2906709909439087, + "reward_std": 0.03546382300555706, + "rewards/final_reward": 0.9750635697684437, + "rewards/mask_iou_reward": 0.48753178488422183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2906709909439087, + "rewards/thk_ans_format_reward": 1.0, + "step": 1289, + "think_completion_length": 31.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 2.178752107925801, + "grad_norm": 12.713163442429922, + "kl": 0.46484375, + "learning_rate": 5.649241146711635e-07, + "loss": 0.0005, + "reward": 3.008612871170044, + "reward_std": 0.2824648320674896, + "rewards/final_reward": 0.9454986306429427, + "rewards/mask_iou_reward": 0.47274931532147135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.008612722158432, + "rewards/thk_ans_format_reward": 1.0, + "step": 1290, + "think_completion_length": 40.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.03125, + "epoch": 2.1804384485666106, + "grad_norm": 26.08829645776549, + "kl": 0.51171875, + "learning_rate": 5.645868465430017e-07, + "loss": 0.0005, + "reward": 3.0946640968322754, + "reward_std": 0.3881853371858597, + "rewards/final_reward": 1.1850956641725252, + "rewards/mask_iou_reward": 0.5925478320862626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.094664067029953, + "rewards/thk_ans_format_reward": 1.0, + "step": 1291, + "think_completion_length": 39.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.34375, + "epoch": 2.18212478920742, + "grad_norm": 8.58694853252235, + "kl": 0.5859375, + "learning_rate": 5.642495784148398e-07, + "loss": 0.0006, + "reward": 3.3137210607528687, + "reward_std": 0.0238959863781929, + "rewards/final_reward": 0.9728564177071632, + "rewards/mask_iou_reward": 0.4864282088535816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3137210607528687, + "rewards/thk_ans_format_reward": 1.0, + "step": 1292, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.6875, + "epoch": 2.1838111298482294, + "grad_norm": 9.334431581768936, + "kl": 0.59765625, + "learning_rate": 5.639123102866779e-07, + "loss": 0.0006, + "reward": 3.366030693054199, + "reward_std": 0.25199363101273775, + "rewards/final_reward": 1.5338307014555888, + "rewards/mask_iou_reward": 0.7669153507277944, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3660306930541992, + "rewards/thk_ans_format_reward": 1.0, + "step": 1293, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.4375, + "epoch": 2.1854974704890386, + "grad_norm": 7.261057150063994, + "kl": 0.5703125, + "learning_rate": 5.635750421585161e-07, + "loss": 0.0006, + "reward": 3.1913743019104004, + "reward_std": 0.09568795189261436, + "rewards/final_reward": 0.8999432986526881, + "rewards/mask_iou_reward": 0.44997164932634404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1913742423057556, + "rewards/thk_ans_format_reward": 1.0, + "step": 1294, + "think_completion_length": 38.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.9375, + "epoch": 2.1871838111298483, + "grad_norm": 11.890263919453417, + "kl": 0.537109375, + "learning_rate": 5.632377740303541e-07, + "loss": 0.0005, + "reward": 3.1532232761383057, + "reward_std": 0.568440705537796, + "rewards/final_reward": 1.0693202949044516, + "rewards/mask_iou_reward": 0.5346601474522258, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1532232761383057, + "rewards/thk_ans_format_reward": 1.0, + "step": 1295, + "think_completion_length": 33.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.65625, + "epoch": 2.1888701517706575, + "grad_norm": 29.777276960107326, + "kl": 0.564453125, + "learning_rate": 5.629005059021922e-07, + "loss": 0.0006, + "reward": 3.4506603479385376, + "reward_std": 0.21746337413787842, + "rewards/final_reward": 1.3186164815143446, + "rewards/mask_iou_reward": 0.6593082407571723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4506604075431824, + "rewards/thk_ans_format_reward": 1.0, + "step": 1296, + "think_completion_length": 34.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.875, + "epoch": 2.190556492411467, + "grad_norm": 10.137710080819277, + "kl": 0.515625, + "learning_rate": 5.625632377740303e-07, + "loss": 0.0005, + "reward": 3.43321430683136, + "reward_std": 0.18915096670389175, + "rewards/final_reward": 1.2422396347537354, + "rewards/mask_iou_reward": 0.6211198173768677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4332143068313599, + "rewards/thk_ans_format_reward": 1.0, + "step": 1297, + "think_completion_length": 32.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.765625, + "epoch": 2.1922428330522767, + "grad_norm": 8.844318257916903, + "kl": 0.5546875, + "learning_rate": 5.622259696458684e-07, + "loss": 0.0005, + "reward": 3.1780149936676025, + "reward_std": 0.08533276757225394, + "rewards/final_reward": 0.7760199678805226, + "rewards/mask_iou_reward": 0.3880099839402613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1780150532722473, + "rewards/thk_ans_format_reward": 1.0, + "step": 1298, + "think_completion_length": 36.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.828125, + "epoch": 2.193929173693086, + "grad_norm": 14.424263342320263, + "kl": 0.53125, + "learning_rate": 5.618887015177065e-07, + "loss": 0.0005, + "reward": 3.1475911140441895, + "reward_std": 0.3342314139008522, + "rewards/final_reward": 1.5412919740557918, + "rewards/mask_iou_reward": 0.7706459870278959, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.147591084241867, + "rewards/thk_ans_format_reward": 1.0, + "step": 1299, + "think_completion_length": 36.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.578125, + "epoch": 2.1956155143338956, + "grad_norm": 169.85177106817954, + "kl": 0.458984375, + "learning_rate": 5.615514333895447e-07, + "loss": 0.0005, + "reward": 3.320096254348755, + "reward_std": 0.22164292633533478, + "rewards/final_reward": 1.459633178376593, + "rewards/mask_iou_reward": 0.7298165891882965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3200964331626892, + "rewards/thk_ans_format_reward": 1.0, + "step": 1300, + "think_completion_length": 36.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.921875, + "epoch": 2.1973018549747048, + "grad_norm": 9.054714053372589, + "kl": 0.49609375, + "learning_rate": 5.612141652613828e-07, + "loss": 0.0005, + "reward": 3.2595558166503906, + "reward_std": 0.3579244762659073, + "rewards/final_reward": 1.6795588356267266, + "rewards/mask_iou_reward": 0.8397794178133633, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2595559358596802, + "rewards/thk_ans_format_reward": 1.0, + "step": 1301, + "think_completion_length": 39.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.90625, + "epoch": 2.1989881956155144, + "grad_norm": 7.766385435361891, + "kl": 0.529296875, + "learning_rate": 5.608768971332209e-07, + "loss": 0.0005, + "reward": 3.2027809619903564, + "reward_std": 0.13372624665498734, + "rewards/final_reward": 0.9463063106918193, + "rewards/mask_iou_reward": 0.4731531553459096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2027809023857117, + "rewards/thk_ans_format_reward": 1.0, + "step": 1302, + "think_completion_length": 34.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.53125, + "epoch": 2.2006745362563236, + "grad_norm": 17.516372215159315, + "kl": 0.58984375, + "learning_rate": 5.605396290050591e-07, + "loss": 0.0006, + "reward": 3.0270960330963135, + "reward_std": 0.1305270530283451, + "rewards/final_reward": 0.9893757757420829, + "rewards/mask_iou_reward": 0.49468788787104145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0270961076021194, + "rewards/thk_ans_format_reward": 1.0, + "step": 1303, + "think_completion_length": 34.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.953125, + "epoch": 2.2023608768971332, + "grad_norm": 9.152942586295097, + "kl": 0.494140625, + "learning_rate": 5.602023608768971e-07, + "loss": 0.0005, + "reward": 3.0987539291381836, + "reward_std": 0.21952814608812332, + "rewards/final_reward": 1.2299166486700184, + "rewards/mask_iou_reward": 0.6149583243350092, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0987539291381836, + "rewards/thk_ans_format_reward": 1.0, + "step": 1304, + "think_completion_length": 41.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.28125, + "epoch": 2.204047217537943, + "grad_norm": 10.836055739499022, + "kl": 0.55859375, + "learning_rate": 5.598650927487351e-07, + "loss": 0.0006, + "reward": 3.4143790006637573, + "reward_std": 0.2662373185157776, + "rewards/final_reward": 1.3339210672457629, + "rewards/mask_iou_reward": 0.6669605336228814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4143790006637573, + "rewards/thk_ans_format_reward": 1.0, + "step": 1305, + "think_completion_length": 35.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.140625, + "epoch": 2.205733558178752, + "grad_norm": 8.93654538137696, + "kl": 0.548828125, + "learning_rate": 5.595278246205733e-07, + "loss": 0.0005, + "reward": 3.10241162776947, + "reward_std": 0.39101406559348106, + "rewards/final_reward": 0.8648975132692235, + "rewards/mask_iou_reward": 0.43244875663461174, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1024115979671478, + "rewards/thk_ans_format_reward": 1.0, + "step": 1306, + "think_completion_length": 35.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.265625, + "epoch": 2.2074198988195617, + "grad_norm": 15.25434670003201, + "kl": 0.54296875, + "learning_rate": 5.591905564924114e-07, + "loss": 0.0005, + "reward": 3.0681989192962646, + "reward_std": 0.301235631108284, + "rewards/final_reward": 0.7502557477642079, + "rewards/mask_iou_reward": 0.37512787388210395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0681988894939423, + "rewards/thk_ans_format_reward": 1.0, + "step": 1307, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.46875, + "epoch": 2.209106239460371, + "grad_norm": 17.605437350876464, + "kl": 0.4931640625, + "learning_rate": 5.588532883642495e-07, + "loss": 0.0005, + "reward": 3.771380066871643, + "reward_std": 0.11449036654084921, + "rewards/final_reward": 1.781286348007805, + "rewards/mask_iou_reward": 0.8906431740039025, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.771379828453064, + "rewards/thk_ans_format_reward": 1.0, + "step": 1308, + "think_completion_length": 42.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.234375, + "epoch": 2.2107925801011805, + "grad_norm": 14.745100662483619, + "kl": 0.541015625, + "learning_rate": 5.585160202360877e-07, + "loss": 0.0006, + "reward": 3.74048388004303, + "reward_std": 0.01679701777175069, + "rewards/final_reward": 1.5908809315176096, + "rewards/mask_iou_reward": 0.7954404657588048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.740483820438385, + "rewards/thk_ans_format_reward": 1.0, + "step": 1309, + "think_completion_length": 41.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.75, + "epoch": 2.2124789207419897, + "grad_norm": 5.431669802578124, + "kl": 0.5029296875, + "learning_rate": 5.581787521079258e-07, + "loss": 0.0005, + "reward": 3.6262569427490234, + "reward_std": 0.1467602625489235, + "rewards/final_reward": 1.7593725993227114, + "rewards/mask_iou_reward": 0.8796862996613557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6262570023536682, + "rewards/thk_ans_format_reward": 1.0, + "step": 1310, + "think_completion_length": 36.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.578125, + "epoch": 2.2141652613827993, + "grad_norm": 6.749141833190477, + "kl": 0.5625, + "learning_rate": 5.57841483979764e-07, + "loss": 0.0006, + "reward": 3.4961687326431274, + "reward_std": 0.1268460345454514, + "rewards/final_reward": 1.882680523286068, + "rewards/mask_iou_reward": 0.941340261643034, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4961687326431274, + "rewards/thk_ans_format_reward": 1.0, + "step": 1311, + "think_completion_length": 38.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.59375, + "epoch": 2.2158516020236085, + "grad_norm": 13.437066869204576, + "kl": 0.50390625, + "learning_rate": 5.575042158516021e-07, + "loss": 0.0005, + "reward": 3.282013177871704, + "reward_std": 0.3590443627908826, + "rewards/final_reward": 1.405391687345856, + "rewards/mask_iou_reward": 0.702695843672928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.282013177871704, + "rewards/thk_ans_format_reward": 1.0, + "step": 1312, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.484375, + "epoch": 2.217537942664418, + "grad_norm": 6.271928895012018, + "kl": 0.58984375, + "learning_rate": 5.5716694772344e-07, + "loss": 0.0006, + "reward": 3.3246023654937744, + "reward_std": 0.1027023196220398, + "rewards/final_reward": 0.9758942077909897, + "rewards/mask_iou_reward": 0.48794710389549484, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.324602335691452, + "rewards/thk_ans_format_reward": 1.0, + "step": 1313, + "think_completion_length": 35.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.828125, + "epoch": 2.219224283305228, + "grad_norm": 13.85784093229938, + "kl": 0.546875, + "learning_rate": 5.568296795952782e-07, + "loss": 0.0005, + "reward": 3.347952723503113, + "reward_std": 0.1474056839942932, + "rewards/final_reward": 1.0762541446075893, + "rewards/mask_iou_reward": 0.5381270723037946, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3479526042938232, + "rewards/thk_ans_format_reward": 1.0, + "step": 1314, + "think_completion_length": 40.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.609375, + "epoch": 2.220910623946037, + "grad_norm": 24.114698680934087, + "kl": 0.60546875, + "learning_rate": 5.564924114671163e-07, + "loss": 0.0006, + "reward": 3.19465708732605, + "reward_std": 0.1294799353927374, + "rewards/final_reward": 1.13020496978562, + "rewards/mask_iou_reward": 0.56510248489281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1946569681167603, + "rewards/thk_ans_format_reward": 1.0, + "step": 1315, + "think_completion_length": 37.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.859375, + "epoch": 2.2225969645868466, + "grad_norm": 40.02479739699717, + "kl": 0.486328125, + "learning_rate": 5.561551433389544e-07, + "loss": 0.0005, + "reward": 3.1586406230926514, + "reward_std": 0.19064200669527054, + "rewards/final_reward": 0.9896240409602192, + "rewards/mask_iou_reward": 0.4948120204801096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1586405336856842, + "rewards/thk_ans_format_reward": 1.0, + "step": 1316, + "think_completion_length": 36.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.46875, + "epoch": 2.224283305227656, + "grad_norm": 6.046357665038441, + "kl": 0.5341796875, + "learning_rate": 5.558178752107926e-07, + "loss": 0.0005, + "reward": 2.572770595550537, + "reward_std": 0.11222269444260746, + "rewards/final_reward": 0.11813676432442753, + "rewards/mask_iou_reward": 0.059068382162213766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5727706551551819, + "rewards/thk_ans_format_reward": 1.0, + "step": 1317, + "think_completion_length": 41.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.203125, + "epoch": 2.2259696458684655, + "grad_norm": 11.749521563087038, + "kl": 0.4873046875, + "learning_rate": 5.554806070826307e-07, + "loss": 0.0005, + "reward": 3.731950521469116, + "reward_std": 0.06127586215734482, + "rewards/final_reward": 1.704703487748192, + "rewards/mask_iou_reward": 0.852351743874096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.731950581073761, + "rewards/thk_ans_format_reward": 1.0, + "step": 1318, + "think_completion_length": 34.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.15625, + "epoch": 2.2276559865092747, + "grad_norm": 9.448524216182863, + "kl": 0.55078125, + "learning_rate": 5.551433389544688e-07, + "loss": 0.0006, + "reward": 3.561526656150818, + "reward_std": 0.017425385303795338, + "rewards/final_reward": 1.835540433113184, + "rewards/mask_iou_reward": 0.917770216556592, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5615267157554626, + "rewards/thk_ans_format_reward": 1.0, + "step": 1319, + "think_completion_length": 31.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.046875, + "epoch": 2.2293423271500843, + "grad_norm": 17.403947483194223, + "kl": 0.576171875, + "learning_rate": 5.54806070826307e-07, + "loss": 0.0006, + "reward": 3.371973991394043, + "reward_std": 0.29090772196650505, + "rewards/final_reward": 0.8479355704681258, + "rewards/mask_iou_reward": 0.4239677852340629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3719740509986877, + "rewards/thk_ans_format_reward": 1.0, + "step": 1320, + "think_completion_length": 39.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.59375, + "epoch": 2.231028667790894, + "grad_norm": 5.326169338549298, + "kl": 0.53125, + "learning_rate": 5.544688026981449e-07, + "loss": 0.0005, + "reward": 3.568778872489929, + "reward_std": 0.3622869700193405, + "rewards/final_reward": 1.4941932238839515, + "rewards/mask_iou_reward": 0.7470966119419757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5687788724899292, + "rewards/thk_ans_format_reward": 1.0, + "step": 1321, + "think_completion_length": 38.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0625, + "epoch": 2.232715008431703, + "grad_norm": 14.990765910375735, + "kl": 0.541015625, + "learning_rate": 5.54131534569983e-07, + "loss": 0.0005, + "reward": 3.765368938446045, + "reward_std": 0.04102184996008873, + "rewards/final_reward": 1.6718389758842933, + "rewards/mask_iou_reward": 0.8359194879421467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.765368938446045, + "rewards/thk_ans_format_reward": 1.0, + "step": 1322, + "think_completion_length": 32.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.953125, + "epoch": 2.2344013490725128, + "grad_norm": 9.527736157533576, + "kl": 0.5, + "learning_rate": 5.537942664418212e-07, + "loss": 0.0005, + "reward": 3.199765205383301, + "reward_std": 0.1792975813150406, + "rewards/final_reward": 1.5867222237886813, + "rewards/mask_iou_reward": 0.7933611118943407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1997651159763336, + "rewards/thk_ans_format_reward": 1.0, + "step": 1323, + "think_completion_length": 35.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.046875, + "epoch": 2.236087689713322, + "grad_norm": 11.753149753032291, + "kl": 0.486328125, + "learning_rate": 5.534569983136593e-07, + "loss": 0.0005, + "reward": 3.013986587524414, + "reward_std": 0.09028564766049385, + "rewards/final_reward": 1.502550902240183, + "rewards/mask_iou_reward": 0.7512754511200915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.013986587524414, + "rewards/thk_ans_format_reward": 1.0, + "step": 1324, + "think_completion_length": 42.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.03125, + "epoch": 2.2377740303541316, + "grad_norm": 5.303122418256821, + "kl": 0.556640625, + "learning_rate": 5.531197301854974e-07, + "loss": 0.0006, + "reward": 3.7323914766311646, + "reward_std": 0.015024483669549227, + "rewards/final_reward": 1.5586951757322525, + "rewards/mask_iou_reward": 0.7793475878661262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7323914170265198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1325, + "think_completion_length": 37.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.546875, + "epoch": 2.2394603709949408, + "grad_norm": 6.777899205372352, + "kl": 0.537109375, + "learning_rate": 5.527824620573356e-07, + "loss": 0.0005, + "reward": 3.7131404876708984, + "reward_std": 0.25278275832533836, + "rewards/final_reward": 1.6623295700856646, + "rewards/mask_iou_reward": 0.8311647850428323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7131404876708984, + "rewards/thk_ans_format_reward": 1.0, + "step": 1326, + "think_completion_length": 33.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.40625, + "epoch": 2.2411467116357504, + "grad_norm": 20.40708241683717, + "kl": 0.55859375, + "learning_rate": 5.524451939291737e-07, + "loss": 0.0006, + "reward": 3.3399040699005127, + "reward_std": 0.1821693703532219, + "rewards/final_reward": 1.5421490310798234, + "rewards/mask_iou_reward": 0.7710745155399117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3399040699005127, + "rewards/thk_ans_format_reward": 1.0, + "step": 1327, + "think_completion_length": 38.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.46875, + "epoch": 2.24283305227656, + "grad_norm": 7.464542519723734, + "kl": 0.580078125, + "learning_rate": 5.521079258010118e-07, + "loss": 0.0006, + "reward": 3.447382092475891, + "reward_std": 0.2822858989238739, + "rewards/final_reward": 1.7969582167403346, + "rewards/mask_iou_reward": 0.8984791083701673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4473822116851807, + "rewards/thk_ans_format_reward": 1.0, + "step": 1328, + "think_completion_length": 35.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.890625, + "epoch": 2.2445193929173692, + "grad_norm": 6.307277165051091, + "kl": 0.5390625, + "learning_rate": 5.5177065767285e-07, + "loss": 0.0005, + "reward": 2.7995030879974365, + "reward_std": 0.02789947483688593, + "rewards/final_reward": 0.12901411624845935, + "rewards/mask_iou_reward": 0.06450705812422967, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7995031476020813, + "rewards/thk_ans_format_reward": 1.0, + "step": 1329, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.96875, + "epoch": 2.246205733558179, + "grad_norm": 24.16342895469795, + "kl": 0.501953125, + "learning_rate": 5.514333895446879e-07, + "loss": 0.0005, + "reward": 3.5670766830444336, + "reward_std": 0.13270878046751022, + "rewards/final_reward": 1.3634549278573478, + "rewards/mask_iou_reward": 0.6817274639286739, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.567076563835144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1330, + "think_completion_length": 37.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.359375, + "epoch": 2.247892074198988, + "grad_norm": 16.82568910254876, + "kl": 0.541015625, + "learning_rate": 5.51096121416526e-07, + "loss": 0.0005, + "reward": 3.664430260658264, + "reward_std": 0.09643076732754707, + "rewards/final_reward": 1.8536971501973039, + "rewards/mask_iou_reward": 0.9268485750986519, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6644301414489746, + "rewards/thk_ans_format_reward": 1.0, + "step": 1331, + "think_completion_length": 36.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.875, + "epoch": 2.2495784148397977, + "grad_norm": 18.817463938613194, + "kl": 0.56640625, + "learning_rate": 5.507588532883642e-07, + "loss": 0.0006, + "reward": 3.710629940032959, + "reward_std": 0.06464794278144836, + "rewards/final_reward": 1.9061641571540149, + "rewards/mask_iou_reward": 0.9530820785770074, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.710629940032959, + "rewards/thk_ans_format_reward": 1.0, + "step": 1332, + "think_completion_length": 40.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.65625, + "epoch": 2.251264755480607, + "grad_norm": 11.400527521207344, + "kl": 0.677734375, + "learning_rate": 5.504215851602023e-07, + "loss": 0.0007, + "reward": 3.312086343765259, + "reward_std": 0.20460295677185059, + "rewards/final_reward": 1.68854368302632, + "rewards/mask_iou_reward": 0.84427184151316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3120863437652588, + "rewards/thk_ans_format_reward": 1.0, + "step": 1333, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.515625, + "epoch": 2.2529510961214165, + "grad_norm": 21.22683690022595, + "kl": 0.5078125, + "learning_rate": 5.500843170320405e-07, + "loss": 0.0005, + "reward": 3.294277548789978, + "reward_std": 0.022508380352519453, + "rewards/final_reward": 1.325478289544912, + "rewards/mask_iou_reward": 0.662739144772456, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.294277548789978, + "rewards/thk_ans_format_reward": 1.0, + "step": 1334, + "think_completion_length": 39.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.515625, + "epoch": 2.254637436762226, + "grad_norm": 6.630398678310001, + "kl": 0.60546875, + "learning_rate": 5.497470489038786e-07, + "loss": 0.0006, + "reward": 3.027361512184143, + "reward_std": 0.14319058507680893, + "rewards/final_reward": 1.07448501438753, + "rewards/mask_iou_reward": 0.537242507193765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0273613929748535, + "rewards/thk_ans_format_reward": 1.0, + "step": 1335, + "think_completion_length": 37.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.234375, + "epoch": 2.2563237774030354, + "grad_norm": 5.367508776552314, + "kl": 0.546875, + "learning_rate": 5.494097807757167e-07, + "loss": 0.0005, + "reward": 3.3002820014953613, + "reward_std": 0.17817065119743347, + "rewards/final_reward": 1.2256184729200479, + "rewards/mask_iou_reward": 0.6128092364600239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.300282061100006, + "rewards/thk_ans_format_reward": 1.0, + "step": 1336, + "think_completion_length": 38.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.515625, + "epoch": 2.258010118043845, + "grad_norm": 11.956116301646125, + "kl": 0.48046875, + "learning_rate": 5.490725126475549e-07, + "loss": 0.0005, + "reward": 3.174812436103821, + "reward_std": 0.3088492304086685, + "rewards/final_reward": 1.0705311364499366, + "rewards/mask_iou_reward": 0.5352655682249683, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.174812376499176, + "rewards/thk_ans_format_reward": 1.0, + "step": 1337, + "think_completion_length": 39.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.6875, + "epoch": 2.259696458684654, + "grad_norm": 9.85948051241923, + "kl": 0.53515625, + "learning_rate": 5.487352445193929e-07, + "loss": 0.0005, + "reward": 3.329120635986328, + "reward_std": 0.1026376448571682, + "rewards/final_reward": 1.8027696973468261, + "rewards/mask_iou_reward": 0.9013848486734131, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3291206359863281, + "rewards/thk_ans_format_reward": 1.0, + "step": 1338, + "think_completion_length": 36.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.46875, + "epoch": 2.261382799325464, + "grad_norm": 14.780010393420275, + "kl": 0.638671875, + "learning_rate": 5.483979763912309e-07, + "loss": 0.0006, + "reward": 3.4766530990600586, + "reward_std": 0.07312630349770188, + "rewards/final_reward": 1.3158479292079335, + "rewards/mask_iou_reward": 0.6579239646039667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.476653277873993, + "rewards/thk_ans_format_reward": 1.0, + "step": 1339, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.1875, + "epoch": 2.263069139966273, + "grad_norm": 10.052671629886701, + "kl": 0.59375, + "learning_rate": 5.480607082630691e-07, + "loss": 0.0006, + "reward": 3.359123110771179, + "reward_std": 0.1407726462930441, + "rewards/final_reward": 1.7389720199769068, + "rewards/mask_iou_reward": 0.8694860099884534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3591230809688568, + "rewards/thk_ans_format_reward": 1.0, + "step": 1340, + "think_completion_length": 40.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.953125, + "epoch": 2.2647554806070826, + "grad_norm": 9.111437858737153, + "kl": 0.68359375, + "learning_rate": 5.477234401349072e-07, + "loss": 0.0007, + "reward": 3.3291393518447876, + "reward_std": 0.13854620698839426, + "rewards/final_reward": 1.5084446019798792, + "rewards/mask_iou_reward": 0.7542223009899396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3291394114494324, + "rewards/thk_ans_format_reward": 1.0, + "step": 1341, + "think_completion_length": 35.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.4375, + "epoch": 2.2664418212478923, + "grad_norm": 15.210744253675966, + "kl": 0.609375, + "learning_rate": 5.473861720067453e-07, + "loss": 0.0005, + "reward": 3.517295718193054, + "reward_std": 0.1796765811741352, + "rewards/final_reward": 1.1933201488455092, + "rewards/mask_iou_reward": 0.5966600744227546, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5172955989837646, + "rewards/thk_ans_format_reward": 1.0, + "step": 1342, + "think_completion_length": 34.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.578125, + "epoch": 2.2681281618887015, + "grad_norm": 13.824162919930119, + "kl": 0.5390625, + "learning_rate": 5.470489038785835e-07, + "loss": 0.0005, + "reward": 2.7883373498916626, + "reward_std": 0.13726412784308195, + "rewards/final_reward": 0.7340926898184925, + "rewards/mask_iou_reward": 0.36704634490924626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7883373498916626, + "rewards/thk_ans_format_reward": 1.0, + "step": 1343, + "think_completion_length": 33.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.703125, + "epoch": 2.269814502529511, + "grad_norm": 10.862051768430439, + "kl": 0.521484375, + "learning_rate": 5.467116357504216e-07, + "loss": 0.0005, + "reward": 3.518467664718628, + "reward_std": 0.0765317790210247, + "rewards/final_reward": 1.8295472667170398, + "rewards/mask_iou_reward": 0.9147736333585199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5184677839279175, + "rewards/thk_ans_format_reward": 1.0, + "step": 1344, + "think_completion_length": 38.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 2.2715008431703203, + "grad_norm": 18.923876243262153, + "kl": 0.640625, + "learning_rate": 5.463743676222597e-07, + "loss": 0.0006, + "reward": 3.4015763998031616, + "reward_std": 0.2154865264892578, + "rewards/final_reward": 1.243939497565763, + "rewards/mask_iou_reward": 0.6219697487828815, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4015763401985168, + "rewards/thk_ans_format_reward": 1.0, + "step": 1345, + "think_completion_length": 34.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.578125, + "epoch": 2.27318718381113, + "grad_norm": 10.46959866402287, + "kl": 0.69921875, + "learning_rate": 5.460370994940978e-07, + "loss": 0.0007, + "reward": 3.545522689819336, + "reward_std": 0.14559809491038322, + "rewards/final_reward": 1.663084276520829, + "rewards/mask_iou_reward": 0.8315421382604145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5455228686332703, + "rewards/thk_ans_format_reward": 1.0, + "step": 1346, + "think_completion_length": 37.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.984375, + "epoch": 2.274873524451939, + "grad_norm": 13.535203472597521, + "kl": 0.5234375, + "learning_rate": 5.456998313659359e-07, + "loss": 0.0005, + "reward": 3.331748127937317, + "reward_std": 0.3237799145281315, + "rewards/final_reward": 1.1902394554738316, + "rewards/mask_iou_reward": 0.5951197277369158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.331748127937317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1347, + "think_completion_length": 33.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.984375, + "epoch": 2.2765598650927488, + "grad_norm": 8.446735116918521, + "kl": 0.5400390625, + "learning_rate": 5.453625632377739e-07, + "loss": 0.0005, + "reward": 3.732712984085083, + "reward_std": 0.03800155781209469, + "rewards/final_reward": 1.6502427498652197, + "rewards/mask_iou_reward": 0.8251213749326098, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7327128648757935, + "rewards/thk_ans_format_reward": 1.0, + "step": 1348, + "think_completion_length": 36.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.96875, + "epoch": 2.2782462057335584, + "grad_norm": 7.501407452838234, + "kl": 0.64453125, + "learning_rate": 5.450252951096121e-07, + "loss": 0.0006, + "reward": 2.9089930057525635, + "reward_std": 0.10850898921489716, + "rewards/final_reward": 0.8053101366110927, + "rewards/mask_iou_reward": 0.40265506830554637, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9089928865432739, + "rewards/thk_ans_format_reward": 1.0, + "step": 1349, + "think_completion_length": 37.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.125, + "epoch": 2.2799325463743676, + "grad_norm": 5.933995703982366, + "kl": 0.57421875, + "learning_rate": 5.446880269814502e-07, + "loss": 0.0006, + "reward": 3.5692362785339355, + "reward_std": 0.30243778228759766, + "rewards/final_reward": 1.56085562949582, + "rewards/mask_iou_reward": 0.78042781474791, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5692362189292908, + "rewards/thk_ans_format_reward": 1.0, + "step": 1350, + "think_completion_length": 37.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.71875, + "epoch": 2.2816188870151772, + "grad_norm": 16.075818615178704, + "kl": 0.576171875, + "learning_rate": 5.443507588532883e-07, + "loss": 0.0006, + "reward": 2.991969585418701, + "reward_std": 0.3233000710606575, + "rewards/final_reward": 1.320913964818911, + "rewards/mask_iou_reward": 0.6604569824094555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9919696748256683, + "rewards/thk_ans_format_reward": 1.0, + "step": 1351, + "think_completion_length": 35.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.84375, + "epoch": 2.2833052276559864, + "grad_norm": 6.332802892368575, + "kl": 0.5859375, + "learning_rate": 5.440134907251265e-07, + "loss": 0.0006, + "reward": 3.4482442140579224, + "reward_std": 0.22082431614398956, + "rewards/final_reward": 1.2399281795165056, + "rewards/mask_iou_reward": 0.6199640897582528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.448244333267212, + "rewards/thk_ans_format_reward": 1.0, + "step": 1352, + "think_completion_length": 34.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.53125, + "epoch": 2.284991568296796, + "grad_norm": 10.536058948852574, + "kl": 0.517578125, + "learning_rate": 5.436762225969646e-07, + "loss": 0.0005, + "reward": 3.4116357564926147, + "reward_std": 0.3409469872713089, + "rewards/final_reward": 1.5051610832476137, + "rewards/mask_iou_reward": 0.7525805416238068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4116357564926147, + "rewards/thk_ans_format_reward": 1.0, + "step": 1353, + "think_completion_length": 36.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.234375, + "epoch": 2.2866779089376053, + "grad_norm": 8.984527740459715, + "kl": 0.54296875, + "learning_rate": 5.433389544688026e-07, + "loss": 0.0006, + "reward": 3.5945018529891968, + "reward_std": 0.08709852397441864, + "rewards/final_reward": 1.6458639418072991, + "rewards/mask_iou_reward": 0.8229319709036496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5945017337799072, + "rewards/thk_ans_format_reward": 1.0, + "step": 1354, + "think_completion_length": 36.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.859375, + "epoch": 2.288364249578415, + "grad_norm": 10.983386192174155, + "kl": 0.578125, + "learning_rate": 5.430016863406408e-07, + "loss": 0.0006, + "reward": 3.6098439693450928, + "reward_std": 0.04654739610850811, + "rewards/final_reward": 1.3116819439689924, + "rewards/mask_iou_reward": 0.6558409719844962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6098440289497375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1355, + "think_completion_length": 34.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.640625, + "epoch": 2.2900505902192245, + "grad_norm": 5.577383873072806, + "kl": 0.533203125, + "learning_rate": 5.426644182124789e-07, + "loss": 0.0005, + "reward": 3.821853280067444, + "reward_std": 0.02090123761445284, + "rewards/final_reward": 1.8809040738056706, + "rewards/mask_iou_reward": 0.9404520369028353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8218533396720886, + "rewards/thk_ans_format_reward": 1.0, + "step": 1356, + "think_completion_length": 37.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.078125, + "epoch": 2.2917369308600337, + "grad_norm": 12.746405765782752, + "kl": 0.55859375, + "learning_rate": 5.423271500843169e-07, + "loss": 0.0006, + "reward": 3.242617607116699, + "reward_std": 0.08481218665838242, + "rewards/final_reward": 0.8098932419567305, + "rewards/mask_iou_reward": 0.40494662097836526, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2426177263259888, + "rewards/thk_ans_format_reward": 1.0, + "step": 1357, + "think_completion_length": 38.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.71875, + "epoch": 2.2934232715008434, + "grad_norm": 6.137303940016678, + "kl": 0.55078125, + "learning_rate": 5.419898819561551e-07, + "loss": 0.0005, + "reward": 3.6336123943328857, + "reward_std": 0.1637212010100484, + "rewards/final_reward": 1.5722296014889336, + "rewards/mask_iou_reward": 0.7861148007444668, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.633612334728241, + "rewards/thk_ans_format_reward": 1.0, + "step": 1358, + "think_completion_length": 35.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.875, + "epoch": 2.2951096121416525, + "grad_norm": 8.523910222897978, + "kl": 0.521484375, + "learning_rate": 5.416526138279932e-07, + "loss": 0.0005, + "reward": 2.9611575603485107, + "reward_std": 0.5413458049297333, + "rewards/final_reward": 0.999535242586827, + "rewards/mask_iou_reward": 0.4997676212934135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9611575305461884, + "rewards/thk_ans_format_reward": 1.0, + "step": 1359, + "think_completion_length": 39.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.703125, + "epoch": 2.296795952782462, + "grad_norm": 23.755939541075783, + "kl": 0.52734375, + "learning_rate": 5.413153456998314e-07, + "loss": 0.0005, + "reward": 3.1955270767211914, + "reward_std": 0.1914454996585846, + "rewards/final_reward": 0.6409054223441232, + "rewards/mask_iou_reward": 0.3204527111720616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1955272555351257, + "rewards/thk_ans_format_reward": 1.0, + "step": 1360, + "think_completion_length": 34.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.890625, + "epoch": 2.2984822934232714, + "grad_norm": 5.799952126679054, + "kl": 0.64453125, + "learning_rate": 5.409780775716695e-07, + "loss": 0.0007, + "reward": 2.923759698867798, + "reward_std": 0.3102700114250183, + "rewards/final_reward": 0.4870952352978929, + "rewards/mask_iou_reward": 0.24354761764894645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9237594902515411, + "rewards/thk_ans_format_reward": 1.0, + "step": 1361, + "think_completion_length": 35.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.046875, + "epoch": 2.300168634064081, + "grad_norm": 8.073495621179406, + "kl": 0.55859375, + "learning_rate": 5.406408094435076e-07, + "loss": 0.0006, + "reward": 3.1795326471328735, + "reward_std": 0.15434248000383377, + "rewards/final_reward": 0.7367230781079752, + "rewards/mask_iou_reward": 0.3683615390539876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1795325875282288, + "rewards/thk_ans_format_reward": 1.0, + "step": 1362, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.46875, + "epoch": 2.30185497470489, + "grad_norm": 34.4840939789534, + "kl": 0.5546875, + "learning_rate": 5.403035413153457e-07, + "loss": 0.0006, + "reward": 3.4326142072677612, + "reward_std": 0.19122769800014794, + "rewards/final_reward": 1.3455692918990243, + "rewards/mask_iou_reward": 0.6727846459495122, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432614266872406, + "rewards/thk_ans_format_reward": 1.0, + "step": 1363, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0625, + "epoch": 2.3035413153457, + "grad_norm": 9.345827782662276, + "kl": 0.5, + "learning_rate": 5.399662731871838e-07, + "loss": 0.0005, + "reward": 3.6630271673202515, + "reward_std": 0.07718203030526638, + "rewards/final_reward": 1.6574970505074886, + "rewards/mask_iou_reward": 0.8287485252537443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6630271077156067, + "rewards/thk_ans_format_reward": 1.0, + "step": 1364, + "think_completion_length": 36.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.78125, + "epoch": 2.305227655986509, + "grad_norm": 11.544908340862662, + "kl": 0.494140625, + "learning_rate": 5.396290050590219e-07, + "loss": 0.0005, + "reward": 3.705686330795288, + "reward_std": 0.05781856086105108, + "rewards/final_reward": 1.90387979568909, + "rewards/mask_iou_reward": 0.951939897844545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7056862711906433, + "rewards/thk_ans_format_reward": 1.0, + "step": 1365, + "think_completion_length": 34.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.359375, + "epoch": 2.3069139966273187, + "grad_norm": 13.856901701164764, + "kl": 0.61328125, + "learning_rate": 5.3929173693086e-07, + "loss": 0.0006, + "reward": 3.1082570552825928, + "reward_std": 0.1994151696562767, + "rewards/final_reward": 1.4614486050282012, + "rewards/mask_iou_reward": 0.7307243025141006, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1082570552825928, + "rewards/thk_ans_format_reward": 1.0, + "step": 1366, + "think_completion_length": 37.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.125, + "epoch": 2.3086003372681283, + "grad_norm": 10.279635984822152, + "kl": 0.53125, + "learning_rate": 5.389544688026981e-07, + "loss": 0.0005, + "reward": 3.41591477394104, + "reward_std": 0.1839870810508728, + "rewards/final_reward": 1.6761410901629847, + "rewards/mask_iou_reward": 0.8380705450814924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4159146547317505, + "rewards/thk_ans_format_reward": 1.0, + "step": 1367, + "think_completion_length": 39.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.859375, + "epoch": 2.3102866779089375, + "grad_norm": 9.9545793157659, + "kl": 0.693359375, + "learning_rate": 5.386172006745362e-07, + "loss": 0.0007, + "reward": 3.6783969402313232, + "reward_std": 0.2630709856748581, + "rewards/final_reward": 1.4409517987452778, + "rewards/mask_iou_reward": 0.7204758993726389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6783969402313232, + "rewards/thk_ans_format_reward": 1.0, + "step": 1368, + "think_completion_length": 39.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.640625, + "epoch": 2.311973018549747, + "grad_norm": 6.504769492113551, + "kl": 0.65234375, + "learning_rate": 5.382799325463744e-07, + "loss": 0.0007, + "reward": 3.0724406242370605, + "reward_std": 0.21577691286802292, + "rewards/final_reward": 1.3656826390766597, + "rewards/mask_iou_reward": 0.6828413195383298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0724405646324158, + "rewards/thk_ans_format_reward": 1.0, + "step": 1369, + "think_completion_length": 35.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 2.3136593591905563, + "grad_norm": 22.322142442582493, + "kl": 0.552734375, + "learning_rate": 5.379426644182125e-07, + "loss": 0.0005, + "reward": 3.6724932193756104, + "reward_std": 0.031013024039566517, + "rewards/final_reward": 1.6675560661103672, + "rewards/mask_iou_reward": 0.8337780330551836, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6724932789802551, + "rewards/thk_ans_format_reward": 1.0, + "step": 1370, + "think_completion_length": 36.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.171875, + "epoch": 2.315345699831366, + "grad_norm": 8.858965352477439, + "kl": 0.83203125, + "learning_rate": 5.376053962900505e-07, + "loss": 0.0008, + "reward": 3.5324264764785767, + "reward_std": 0.1711833318695426, + "rewards/final_reward": 1.4827722485784687, + "rewards/mask_iou_reward": 0.7413861242892343, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5324264168739319, + "rewards/thk_ans_format_reward": 1.0, + "step": 1371, + "think_completion_length": 32.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.828125, + "epoch": 2.317032040472175, + "grad_norm": 7.364348393792841, + "kl": 0.5859375, + "learning_rate": 5.372681281618887e-07, + "loss": 0.0006, + "reward": 3.3787490129470825, + "reward_std": 0.01813412643969059, + "rewards/final_reward": 0.9400155006785603, + "rewards/mask_iou_reward": 0.47000775033928016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.378748893737793, + "rewards/thk_ans_format_reward": 1.0, + "step": 1372, + "think_completion_length": 33.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.0625, + "epoch": 2.318718381112985, + "grad_norm": 12.402060150068934, + "kl": 0.5859375, + "learning_rate": 5.369308600337268e-07, + "loss": 0.0006, + "reward": 3.37683367729187, + "reward_std": 0.23856448754668236, + "rewards/final_reward": 1.6650647820314266, + "rewards/mask_iou_reward": 0.8325323910157133, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3768335580825806, + "rewards/thk_ans_format_reward": 1.0, + "step": 1373, + "think_completion_length": 35.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.46875, + "epoch": 2.3204047217537944, + "grad_norm": 8.192072648544633, + "kl": 0.578125, + "learning_rate": 5.365935919055648e-07, + "loss": 0.0006, + "reward": 2.7094311714172363, + "reward_std": 0.2052406631410122, + "rewards/final_reward": 0.04370178499354055, + "rewards/mask_iou_reward": 0.021850892496770274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7094311714172363, + "rewards/thk_ans_format_reward": 1.0, + "step": 1374, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 2.3220910623946036, + "grad_norm": 30.616425561825096, + "kl": 0.59375, + "learning_rate": 5.36256323777403e-07, + "loss": 0.0006, + "reward": 3.477543830871582, + "reward_std": 0.17288058251142502, + "rewards/final_reward": 1.4923323475422399, + "rewards/mask_iou_reward": 0.7461661737711199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4775438904762268, + "rewards/thk_ans_format_reward": 1.0, + "step": 1375, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.359375, + "epoch": 2.3237774030354132, + "grad_norm": 9.024795177986041, + "kl": 0.580078125, + "learning_rate": 5.359190556492411e-07, + "loss": 0.0006, + "reward": 3.503230929374695, + "reward_std": 0.1244723740965128, + "rewards/final_reward": 1.5993782177815818, + "rewards/mask_iou_reward": 0.7996891088907909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5032309293746948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1376, + "think_completion_length": 38.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.140625, + "epoch": 2.3254637436762224, + "grad_norm": 6.357890399165426, + "kl": 0.533203125, + "learning_rate": 5.355817875210792e-07, + "loss": 0.0005, + "reward": 3.3854854106903076, + "reward_std": 0.2610369510948658, + "rewards/final_reward": 1.3946295125890742, + "rewards/mask_iou_reward": 0.6973147562945371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3854854702949524, + "rewards/thk_ans_format_reward": 1.0, + "step": 1377, + "think_completion_length": 35.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.53125, + "epoch": 2.327150084317032, + "grad_norm": 11.59512324924954, + "kl": 0.54296875, + "learning_rate": 5.352445193929174e-07, + "loss": 0.0005, + "reward": 3.6056227684020996, + "reward_std": 0.031252100598067045, + "rewards/final_reward": 1.8470167898649712, + "rewards/mask_iou_reward": 0.9235083949324856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6056227684020996, + "rewards/thk_ans_format_reward": 1.0, + "step": 1378, + "think_completion_length": 32.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.359375, + "epoch": 2.3288364249578413, + "grad_norm": 17.104449464900775, + "kl": 0.52734375, + "learning_rate": 5.349072512647554e-07, + "loss": 0.0005, + "reward": 3.4591941833496094, + "reward_std": 0.3491174578666687, + "rewards/final_reward": 1.3752283871348885, + "rewards/mask_iou_reward": 0.6876141935674442, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4591941833496094, + "rewards/thk_ans_format_reward": 1.0, + "step": 1379, + "think_completion_length": 37.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.15625, + "epoch": 2.330522765598651, + "grad_norm": 15.730667505763513, + "kl": 0.5205078125, + "learning_rate": 5.345699831365935e-07, + "loss": 0.0005, + "reward": 3.54481840133667, + "reward_std": 0.047257980331778526, + "rewards/final_reward": 1.5888374056095371, + "rewards/mask_iou_reward": 0.7944187028047686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5448181629180908, + "rewards/thk_ans_format_reward": 1.0, + "step": 1380, + "think_completion_length": 35.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.9375, + "epoch": 2.3322091062394605, + "grad_norm": 7.4373469061295445, + "kl": 0.65625, + "learning_rate": 5.342327150084317e-07, + "loss": 0.0007, + "reward": 2.963295817375183, + "reward_std": 0.29863504134118557, + "rewards/final_reward": 1.3623242165591476, + "rewards/mask_iou_reward": 0.6811621082795738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9632959961891174, + "rewards/thk_ans_format_reward": 1.0, + "step": 1381, + "think_completion_length": 33.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.125, + "epoch": 2.3338954468802697, + "grad_norm": 7.16589243158659, + "kl": 0.56640625, + "learning_rate": 5.338954468802698e-07, + "loss": 0.0006, + "reward": 2.9298386573791504, + "reward_std": 0.012405174784362316, + "rewards/final_reward": 0.9121590290456439, + "rewards/mask_iou_reward": 0.45607951452282197, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9298387765884399, + "rewards/thk_ans_format_reward": 1.0, + "step": 1382, + "think_completion_length": 35.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.53125, + "epoch": 2.3355817875210794, + "grad_norm": 10.52957536273546, + "kl": 0.625, + "learning_rate": 5.33558178752108e-07, + "loss": 0.0006, + "reward": 3.5053229331970215, + "reward_std": 0.2050390988588333, + "rewards/final_reward": 1.8158258342784386, + "rewards/mask_iou_reward": 0.9079129171392193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5053229928016663, + "rewards/thk_ans_format_reward": 1.0, + "step": 1383, + "think_completion_length": 33.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.5, + "epoch": 2.3372681281618886, + "grad_norm": 6.731365865114964, + "kl": 0.560546875, + "learning_rate": 5.33220910623946e-07, + "loss": 0.0006, + "reward": 2.6403086185455322, + "reward_std": 0.09763676300644875, + "rewards/final_reward": 0.785273539529477, + "rewards/mask_iou_reward": 0.3926367697647385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6403087079524994, + "rewards/thk_ans_format_reward": 1.0, + "step": 1384, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.65625, + "epoch": 2.338954468802698, + "grad_norm": 6.818050851838744, + "kl": 0.5400390625, + "learning_rate": 5.328836424957841e-07, + "loss": 0.0005, + "reward": 3.666144609451294, + "reward_std": 0.0915786512196064, + "rewards/final_reward": 1.7479222751329424, + "rewards/mask_iou_reward": 0.8739611375664712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6661444902420044, + "rewards/thk_ans_format_reward": 1.0, + "step": 1385, + "think_completion_length": 36.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.453125, + "epoch": 2.3406408094435074, + "grad_norm": 4.538761263469013, + "kl": 0.564453125, + "learning_rate": 5.325463743676223e-07, + "loss": 0.0006, + "reward": 3.2571221590042114, + "reward_std": 0.03687620488926768, + "rewards/final_reward": 0.650422243607985, + "rewards/mask_iou_reward": 0.3252111218039925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2571222186088562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1386, + "think_completion_length": 38.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.21875, + "epoch": 2.342327150084317, + "grad_norm": 14.347621308169337, + "kl": 0.509765625, + "learning_rate": 5.322091062394604e-07, + "loss": 0.0005, + "reward": 3.35020649433136, + "reward_std": 0.33967210724949837, + "rewards/final_reward": 1.5729735712665027, + "rewards/mask_iou_reward": 0.7864867856332514, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3502064943313599, + "rewards/thk_ans_format_reward": 1.0, + "step": 1387, + "think_completion_length": 40.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.34375, + "epoch": 2.3440134907251267, + "grad_norm": 15.164151268757067, + "kl": 0.51953125, + "learning_rate": 5.318718381112984e-07, + "loss": 0.0005, + "reward": 3.322817802429199, + "reward_std": 0.27229122817516327, + "rewards/final_reward": 1.2874370263274848, + "rewards/mask_iou_reward": 0.6437185131637424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3228177428245544, + "rewards/thk_ans_format_reward": 1.0, + "step": 1388, + "think_completion_length": 38.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 2.345699831365936, + "grad_norm": 11.567337138167554, + "kl": 0.576171875, + "learning_rate": 5.315345699831366e-07, + "loss": 0.0006, + "reward": 3.3506091833114624, + "reward_std": 0.03909984044730663, + "rewards/final_reward": 0.9120567229378608, + "rewards/mask_iou_reward": 0.4560283614689304, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3506091237068176, + "rewards/thk_ans_format_reward": 1.0, + "step": 1389, + "think_completion_length": 39.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.0625, + "epoch": 2.3473861720067455, + "grad_norm": 27.705185381349548, + "kl": 0.59765625, + "learning_rate": 5.311973018549747e-07, + "loss": 0.0006, + "reward": 3.444739580154419, + "reward_std": 0.282623004168272, + "rewards/final_reward": 0.9685749597290699, + "rewards/mask_iou_reward": 0.48428747986453496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4447395205497742, + "rewards/thk_ans_format_reward": 1.0, + "step": 1390, + "think_completion_length": 34.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.078125, + "epoch": 2.3490725126475547, + "grad_norm": 10.792504700501967, + "kl": 0.4921875, + "learning_rate": 5.308600337268128e-07, + "loss": 0.0005, + "reward": 2.987342119216919, + "reward_std": 0.561201810836792, + "rewards/final_reward": 1.012290295770121, + "rewards/mask_iou_reward": 0.5061451478850605, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9873422086238861, + "rewards/thk_ans_format_reward": 1.0, + "step": 1391, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.421875, + "epoch": 2.3507588532883643, + "grad_norm": 7.9102889207307685, + "kl": 0.546875, + "learning_rate": 5.30522765598651e-07, + "loss": 0.0006, + "reward": 3.211172580718994, + "reward_std": 0.06296231271699071, + "rewards/final_reward": 1.4353681768739848, + "rewards/mask_iou_reward": 0.7176840884369924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2111724019050598, + "rewards/thk_ans_format_reward": 1.0, + "step": 1392, + "think_completion_length": 37.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5, + "epoch": 2.3524451939291735, + "grad_norm": 8.891253140343897, + "kl": 0.5625, + "learning_rate": 5.30185497470489e-07, + "loss": 0.0006, + "reward": 3.3555647134780884, + "reward_std": 0.22218644618988037, + "rewards/final_reward": 1.2452854457618507, + "rewards/mask_iou_reward": 0.6226427228809254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3555646538734436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1393, + "think_completion_length": 35.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.609375, + "epoch": 2.354131534569983, + "grad_norm": 11.443989159249242, + "kl": 0.54296875, + "learning_rate": 5.298482293423271e-07, + "loss": 0.0005, + "reward": 3.45237934589386, + "reward_std": 0.16156933456659317, + "rewards/final_reward": 1.792342337405704, + "rewards/mask_iou_reward": 0.896171168702852, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.452379286289215, + "rewards/thk_ans_format_reward": 1.0, + "step": 1394, + "think_completion_length": 36.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.515625, + "epoch": 2.3558178752107928, + "grad_norm": 8.3765850159821, + "kl": 0.67578125, + "learning_rate": 5.295109612141653e-07, + "loss": 0.0007, + "reward": 3.2284774780273438, + "reward_std": 0.22332235658541322, + "rewards/final_reward": 1.5533457099388035, + "rewards/mask_iou_reward": 0.7766728549694017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2284774482250214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1395, + "think_completion_length": 39.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.015625, + "epoch": 2.357504215851602, + "grad_norm": 17.58200086525832, + "kl": 0.54296875, + "learning_rate": 5.291736930860033e-07, + "loss": 0.0005, + "reward": 3.1018285751342773, + "reward_std": 0.35995006561279297, + "rewards/final_reward": 1.0164651908095734, + "rewards/mask_iou_reward": 0.5082325954047867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1018284559249878, + "rewards/thk_ans_format_reward": 1.0, + "step": 1396, + "think_completion_length": 37.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.078125, + "epoch": 2.3591905564924116, + "grad_norm": 22.290024986668044, + "kl": 0.52734375, + "learning_rate": 5.288364249578414e-07, + "loss": 0.0005, + "reward": 2.9656145572662354, + "reward_std": 0.2921905219554901, + "rewards/final_reward": 0.6604535228611185, + "rewards/mask_iou_reward": 0.33022676143055923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9656146168708801, + "rewards/thk_ans_format_reward": 1.0, + "step": 1397, + "think_completion_length": 35.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.53125, + "epoch": 2.360876897133221, + "grad_norm": 3.9345173173490138, + "kl": 0.541015625, + "learning_rate": 5.284991568296796e-07, + "loss": 0.0006, + "reward": 3.059117913246155, + "reward_std": 0.005253995528619271, + "rewards/final_reward": 1.965131045395031, + "rewards/mask_iou_reward": 0.9825655226975155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0591178834438324, + "rewards/thk_ans_format_reward": 1.0, + "step": 1398, + "think_completion_length": 37.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.875, + "epoch": 2.3625632377740304, + "grad_norm": 7.9093384724318865, + "kl": 0.568359375, + "learning_rate": 5.281618887015177e-07, + "loss": 0.0006, + "reward": 3.475532650947571, + "reward_std": 0.1894139125943184, + "rewards/final_reward": 1.1372858494947948, + "rewards/mask_iou_reward": 0.5686429247473974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4755328297615051, + "rewards/thk_ans_format_reward": 1.0, + "step": 1399, + "think_completion_length": 39.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.484375, + "epoch": 2.3642495784148396, + "grad_norm": 6.224619860019826, + "kl": 0.564453125, + "learning_rate": 5.278246205733558e-07, + "loss": 0.0006, + "reward": 2.9742729663848877, + "reward_std": 0.09473420679569244, + "rewards/final_reward": 0.23003473812021966, + "rewards/mask_iou_reward": 0.11501736906010983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9742728471755981, + "rewards/thk_ans_format_reward": 1.0, + "step": 1400, + "think_completion_length": 35.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.46875, + "epoch": 2.3659359190556493, + "grad_norm": 10.832806018074743, + "kl": 0.50390625, + "learning_rate": 5.27487352445194e-07, + "loss": 0.0005, + "reward": 3.378462791442871, + "reward_std": 0.21569720469415188, + "rewards/final_reward": 1.5557822843293696, + "rewards/mask_iou_reward": 0.7778911421646848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.378462791442871, + "rewards/thk_ans_format_reward": 1.0, + "step": 1401, + "think_completion_length": 38.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.046875, + "epoch": 2.367622259696459, + "grad_norm": 7.501941757157272, + "kl": 0.5859375, + "learning_rate": 5.27150084317032e-07, + "loss": 0.0006, + "reward": 3.699384927749634, + "reward_std": 0.06814133375883102, + "rewards/final_reward": 1.5414973015209585, + "rewards/mask_iou_reward": 0.7707486507604793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6993848085403442, + "rewards/thk_ans_format_reward": 1.0, + "step": 1402, + "think_completion_length": 37.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.578125, + "epoch": 2.369308600337268, + "grad_norm": 7.5412459195508035, + "kl": 0.572265625, + "learning_rate": 5.268128161888701e-07, + "loss": 0.0006, + "reward": 3.515091300010681, + "reward_std": 0.18088901042938232, + "rewards/final_reward": 1.3727902363585298, + "rewards/mask_iou_reward": 0.6863951181792649, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5150913000106812, + "rewards/thk_ans_format_reward": 1.0, + "step": 1403, + "think_completion_length": 33.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.9375, + "epoch": 2.3709949409780777, + "grad_norm": 9.174490098877436, + "kl": 0.56640625, + "learning_rate": 5.264755480607082e-07, + "loss": 0.0006, + "reward": 2.920655369758606, + "reward_std": 0.1659610359929502, + "rewards/final_reward": 1.3589814228169435, + "rewards/mask_iou_reward": 0.6794907114084717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9206554293632507, + "rewards/thk_ans_format_reward": 1.0, + "step": 1404, + "think_completion_length": 36.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.609375, + "epoch": 2.372681281618887, + "grad_norm": 5.524729092924367, + "kl": 0.61328125, + "learning_rate": 5.261382799325463e-07, + "loss": 0.0006, + "reward": 3.408183217048645, + "reward_std": 0.11358396708965302, + "rewards/final_reward": 1.3904137277971458, + "rewards/mask_iou_reward": 0.6952068638985729, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4081830978393555, + "rewards/thk_ans_format_reward": 1.0, + "step": 1405, + "think_completion_length": 33.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.984375, + "epoch": 2.3743676222596966, + "grad_norm": 9.897592505644342, + "kl": 0.568359375, + "learning_rate": 5.258010118043844e-07, + "loss": 0.0006, + "reward": 3.7648085355758667, + "reward_std": 0.1485668420791626, + "rewards/final_reward": 1.8613900782667594, + "rewards/mask_iou_reward": 0.9306950391333797, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7648085355758667, + "rewards/thk_ans_format_reward": 1.0, + "step": 1406, + "think_completion_length": 38.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.640625, + "epoch": 2.3760539629005057, + "grad_norm": 14.760801593326821, + "kl": 0.58984375, + "learning_rate": 5.254637436762226e-07, + "loss": 0.0006, + "reward": 3.5451170206069946, + "reward_std": 0.046211473643779755, + "rewards/final_reward": 1.759196898924725, + "rewards/mask_iou_reward": 0.8795984494623625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5451170802116394, + "rewards/thk_ans_format_reward": 1.0, + "step": 1407, + "think_completion_length": 35.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.4375, + "epoch": 2.3777403035413154, + "grad_norm": 6.238961683518768, + "kl": 0.7265625, + "learning_rate": 5.251264755480607e-07, + "loss": 0.0007, + "reward": 3.763440251350403, + "reward_std": 0.28513549268245697, + "rewards/final_reward": 1.729252268258402, + "rewards/mask_iou_reward": 0.864626134129201, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7634401321411133, + "rewards/thk_ans_format_reward": 1.0, + "step": 1408, + "think_completion_length": 33.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.828125, + "epoch": 2.379426644182125, + "grad_norm": 7.543950870667907, + "kl": 0.50390625, + "learning_rate": 5.247892074198989e-07, + "loss": 0.0005, + "reward": 3.1753127574920654, + "reward_std": 0.25032037193886936, + "rewards/final_reward": 1.5657431095470087, + "rewards/mask_iou_reward": 0.7828715547735043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.175312876701355, + "rewards/thk_ans_format_reward": 1.0, + "step": 1409, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.578125, + "epoch": 2.381112984822934, + "grad_norm": 7.826650646237432, + "kl": 0.533203125, + "learning_rate": 5.24451939291737e-07, + "loss": 0.0005, + "reward": 3.2201855182647705, + "reward_std": 0.26124662533402443, + "rewards/final_reward": 1.5676906010913358, + "rewards/mask_iou_reward": 0.7838453005456679, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2201855182647705, + "rewards/thk_ans_format_reward": 1.0, + "step": 1410, + "think_completion_length": 37.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.265625, + "epoch": 2.382799325463744, + "grad_norm": 14.172162637197918, + "kl": 0.515625, + "learning_rate": 5.24114671163575e-07, + "loss": 0.0006, + "reward": 3.4743032455444336, + "reward_std": 0.19433462619781494, + "rewards/final_reward": 1.671436299136126, + "rewards/mask_iou_reward": 0.835718149568063, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.474303126335144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1411, + "think_completion_length": 36.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 2.384485666104553, + "grad_norm": 13.63952704554102, + "kl": 0.7275390625, + "learning_rate": 5.237774030354132e-07, + "loss": 0.0007, + "reward": 3.3273757696151733, + "reward_std": 0.11718492582440376, + "rewards/final_reward": 1.339164020065489, + "rewards/mask_iou_reward": 0.6695820100327445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3273758292198181, + "rewards/thk_ans_format_reward": 1.0, + "step": 1412, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.234375, + "epoch": 2.3861720067453627, + "grad_norm": 9.02357872456789, + "kl": 0.55078125, + "learning_rate": 5.234401349072512e-07, + "loss": 0.0005, + "reward": 3.219269037246704, + "reward_std": 0.408921817317605, + "rewards/final_reward": 1.103784555988953, + "rewards/mask_iou_reward": 0.5518922779944765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2192689776420593, + "rewards/thk_ans_format_reward": 1.0, + "step": 1413, + "think_completion_length": 39.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.4375, + "epoch": 2.387858347386172, + "grad_norm": 5.269468878977596, + "kl": 0.537109375, + "learning_rate": 5.231028667790893e-07, + "loss": 0.0005, + "reward": 3.221733808517456, + "reward_std": 0.255710706114769, + "rewards/final_reward": 1.6817738791159162, + "rewards/mask_iou_reward": 0.8408869395579581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2217336893081665, + "rewards/thk_ans_format_reward": 1.0, + "step": 1414, + "think_completion_length": 35.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.65625, + "epoch": 2.3895446880269815, + "grad_norm": 11.668623025824209, + "kl": 0.59375, + "learning_rate": 5.227655986509275e-07, + "loss": 0.0006, + "reward": 2.9161574840545654, + "reward_std": 0.1359611563384533, + "rewards/final_reward": 0.3541008718656232, + "rewards/mask_iou_reward": 0.1770504359328116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9161574840545654, + "rewards/thk_ans_format_reward": 1.0, + "step": 1415, + "think_completion_length": 37.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.203125, + "epoch": 2.391231028667791, + "grad_norm": 6.835618298256512, + "kl": 0.55859375, + "learning_rate": 5.224283305227656e-07, + "loss": 0.0006, + "reward": 2.6472376585006714, + "reward_std": 0.19794801366515458, + "rewards/final_reward": 0.05851637501519018, + "rewards/mask_iou_reward": 0.02925818750759509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6472376435995102, + "rewards/thk_ans_format_reward": 1.0, + "step": 1416, + "think_completion_length": 36.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.71875, + "epoch": 2.3929173693086003, + "grad_norm": 11.735490794711714, + "kl": 0.58984375, + "learning_rate": 5.220910623946037e-07, + "loss": 0.0006, + "reward": 3.3064658641815186, + "reward_std": 0.09207919798791409, + "rewards/final_reward": 1.5212218066720333, + "rewards/mask_iou_reward": 0.7606109033360167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3064658045768738, + "rewards/thk_ans_format_reward": 1.0, + "step": 1417, + "think_completion_length": 43.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.71875, + "epoch": 2.39460370994941, + "grad_norm": 7.427700460484598, + "kl": 0.689453125, + "learning_rate": 5.217537942664419e-07, + "loss": 0.0007, + "reward": 3.496484875679016, + "reward_std": 0.27999068424105644, + "rewards/final_reward": 1.4666861887157916, + "rewards/mask_iou_reward": 0.7333430943578958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.496484637260437, + "rewards/thk_ans_format_reward": 1.0, + "step": 1418, + "think_completion_length": 35.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.046875, + "epoch": 2.396290050590219, + "grad_norm": 6.7558319839546215, + "kl": 0.580078125, + "learning_rate": 5.214165261382799e-07, + "loss": 0.0006, + "reward": 3.452078342437744, + "reward_std": 0.18629483878612518, + "rewards/final_reward": 1.9352354965872987, + "rewards/mask_iou_reward": 0.9676177482936493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4520783424377441, + "rewards/thk_ans_format_reward": 1.0, + "step": 1419, + "think_completion_length": 38.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.515625, + "epoch": 2.397976391231029, + "grad_norm": 19.13059374771945, + "kl": 0.55859375, + "learning_rate": 5.21079258010118e-07, + "loss": 0.0006, + "reward": 3.172434687614441, + "reward_std": 0.021215507294982672, + "rewards/final_reward": 1.0803966303114596, + "rewards/mask_iou_reward": 0.5401983151557298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1724347174167633, + "rewards/thk_ans_format_reward": 1.0, + "step": 1420, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 2.399662731871838, + "grad_norm": 8.533750845914334, + "kl": 0.572265625, + "learning_rate": 5.207419898819561e-07, + "loss": 0.0006, + "reward": 3.4311397075653076, + "reward_std": 0.12119658989831805, + "rewards/final_reward": 1.1740384616766832, + "rewards/mask_iou_reward": 0.5870192308383416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.431139886379242, + "rewards/thk_ans_format_reward": 1.0, + "step": 1421, + "think_completion_length": 44.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.609375, + "epoch": 2.4013490725126476, + "grad_norm": 8.827783645808221, + "kl": 0.578125, + "learning_rate": 5.204047217537942e-07, + "loss": 0.0006, + "reward": 3.4205384254455566, + "reward_std": 0.13982452638447285, + "rewards/final_reward": 1.4854884004504718, + "rewards/mask_iou_reward": 0.7427442002252359, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4205384850502014, + "rewards/thk_ans_format_reward": 1.0, + "step": 1422, + "think_completion_length": 38.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.328125, + "epoch": 2.403035413153457, + "grad_norm": 6.05465436982932, + "kl": 1.525390625, + "learning_rate": 5.200674536256323e-07, + "loss": 0.0015, + "reward": 3.809617519378662, + "reward_std": 0.1390428734011948, + "rewards/final_reward": 1.732256359496069, + "rewards/mask_iou_reward": 0.8661281797480345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8096173405647278, + "rewards/thk_ans_format_reward": 1.0, + "step": 1423, + "think_completion_length": 41.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.90625, + "epoch": 2.4047217537942664, + "grad_norm": 7.545060038565714, + "kl": 0.529296875, + "learning_rate": 5.197301854974705e-07, + "loss": 0.0005, + "reward": 3.338720679283142, + "reward_std": 0.09901190176606178, + "rewards/final_reward": 1.2441309370313292, + "rewards/mask_iou_reward": 0.6220654685156646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3387206196784973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1424, + "think_completion_length": 38.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.390625, + "epoch": 2.4064080944350756, + "grad_norm": 8.125363416747533, + "kl": 0.6171875, + "learning_rate": 5.193929173693086e-07, + "loss": 0.0006, + "reward": 3.300279378890991, + "reward_std": 0.07601998746395111, + "rewards/final_reward": 1.452418104648395, + "rewards/mask_iou_reward": 0.7262090523241975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3002793192863464, + "rewards/thk_ans_format_reward": 1.0, + "step": 1425, + "think_completion_length": 34.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.6875, + "epoch": 2.4080944350758853, + "grad_norm": 14.793698598847602, + "kl": 0.552734375, + "learning_rate": 5.190556492411467e-07, + "loss": 0.0006, + "reward": 3.349576950073242, + "reward_std": 0.06855934672057629, + "rewards/final_reward": 1.3882612799502339, + "rewards/mask_iou_reward": 0.6941306399751169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3495770692825317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1426, + "think_completion_length": 36.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.328125, + "epoch": 2.409780775716695, + "grad_norm": 22.392515052220187, + "kl": 0.63671875, + "learning_rate": 5.187183811129849e-07, + "loss": 0.0006, + "reward": 3.1435389518737793, + "reward_std": 0.22166889160871506, + "rewards/final_reward": 0.7710225803234687, + "rewards/mask_iou_reward": 0.38551129016173435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.143539011478424, + "rewards/thk_ans_format_reward": 1.0, + "step": 1427, + "think_completion_length": 41.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.84375, + "epoch": 2.411467116357504, + "grad_norm": 5.926089505112166, + "kl": 0.515625, + "learning_rate": 5.183811129848229e-07, + "loss": 0.0005, + "reward": 3.498886823654175, + "reward_std": 0.2120041623711586, + "rewards/final_reward": 1.5323652788101025, + "rewards/mask_iou_reward": 0.7661826394050513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4988868236541748, + "rewards/thk_ans_format_reward": 1.0, + "step": 1428, + "think_completion_length": 41.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.234375, + "epoch": 2.4131534569983137, + "grad_norm": 6.680627025015385, + "kl": 0.5703125, + "learning_rate": 5.180438448566609e-07, + "loss": 0.0006, + "reward": 3.243508219718933, + "reward_std": 0.17464113235473633, + "rewards/final_reward": 1.0407643744754143, + "rewards/mask_iou_reward": 0.5203821872377071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.243508219718933, + "rewards/thk_ans_format_reward": 1.0, + "step": 1429, + "think_completion_length": 40.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.921875, + "epoch": 2.414839797639123, + "grad_norm": 8.701499831433182, + "kl": 0.568359375, + "learning_rate": 5.177065767284991e-07, + "loss": 0.0006, + "reward": 2.999347448348999, + "reward_std": 0.16424234956502914, + "rewards/final_reward": 1.4692365949521713, + "rewards/mask_iou_reward": 0.7346182974760856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9993474781513214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1430, + "think_completion_length": 38.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.796875, + "epoch": 2.4165261382799326, + "grad_norm": 8.019057025347687, + "kl": 0.5146484375, + "learning_rate": 5.173693086003372e-07, + "loss": 0.0005, + "reward": 3.8532402515411377, + "reward_std": 0.02663713227957487, + "rewards/final_reward": 1.9138304988010595, + "rewards/mask_iou_reward": 0.9569152494005297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8532403707504272, + "rewards/thk_ans_format_reward": 1.0, + "step": 1431, + "think_completion_length": 36.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.5, + "epoch": 2.4182124789207418, + "grad_norm": 10.521478360313901, + "kl": 0.69921875, + "learning_rate": 5.170320404721753e-07, + "loss": 0.0007, + "reward": 3.5484408140182495, + "reward_std": 0.04367404989898205, + "rewards/final_reward": 1.6665183138121653, + "rewards/mask_iou_reward": 0.8332591569060827, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.54844069480896, + "rewards/thk_ans_format_reward": 1.0, + "step": 1432, + "think_completion_length": 35.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.40625, + "epoch": 2.4198988195615514, + "grad_norm": 11.393237382787158, + "kl": 0.5859375, + "learning_rate": 5.166947723440135e-07, + "loss": 0.0006, + "reward": 3.6145013570785522, + "reward_std": 0.08510691672563553, + "rewards/final_reward": 1.6453253678397484, + "rewards/mask_iou_reward": 0.8226626839198742, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6145014762878418, + "rewards/thk_ans_format_reward": 1.0, + "step": 1433, + "think_completion_length": 37.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.71875, + "epoch": 2.421585160202361, + "grad_norm": 12.955716447139663, + "kl": 0.517578125, + "learning_rate": 5.163575042158516e-07, + "loss": 0.0005, + "reward": 3.325462579727173, + "reward_std": 0.31851503252983093, + "rewards/final_reward": 1.244566120650251, + "rewards/mask_iou_reward": 0.6222830603251255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.325462520122528, + "rewards/thk_ans_format_reward": 1.0, + "step": 1434, + "think_completion_length": 39.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.796875, + "epoch": 2.4232715008431702, + "grad_norm": 26.17046760870296, + "kl": 0.58984375, + "learning_rate": 5.160202360876898e-07, + "loss": 0.0006, + "reward": 3.147235631942749, + "reward_std": 0.26723285019397736, + "rewards/final_reward": 1.2566699240865904, + "rewards/mask_iou_reward": 0.6283349620432952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1472358107566833, + "rewards/thk_ans_format_reward": 1.0, + "step": 1435, + "think_completion_length": 37.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.765625, + "epoch": 2.42495784148398, + "grad_norm": 8.568795937105243, + "kl": 0.615234375, + "learning_rate": 5.156829679595279e-07, + "loss": 0.0006, + "reward": 3.0845930576324463, + "reward_std": 0.33267538249492645, + "rewards/final_reward": 1.0768244291067677, + "rewards/mask_iou_reward": 0.5384122145533838, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1158430576324463, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1436, + "think_completion_length": 37.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.015625, + "epoch": 2.426644182124789, + "grad_norm": 86.30501173717676, + "kl": 0.556640625, + "learning_rate": 5.153456998313658e-07, + "loss": 0.0005, + "reward": 3.022960066795349, + "reward_std": 0.30731740966439247, + "rewards/final_reward": 1.3756990503386066, + "rewards/mask_iou_reward": 0.6878495251693033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0229600071907043, + "rewards/thk_ans_format_reward": 1.0, + "step": 1437, + "think_completion_length": 39.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.09375, + "epoch": 2.4283305227655987, + "grad_norm": 19.65157587762331, + "kl": 0.6171875, + "learning_rate": 5.15008431703204e-07, + "loss": 0.0006, + "reward": 3.5178322792053223, + "reward_std": 0.22085876762866974, + "rewards/final_reward": 1.7174181471389547, + "rewards/mask_iou_reward": 0.8587090735694773, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.517832338809967, + "rewards/thk_ans_format_reward": 1.0, + "step": 1438, + "think_completion_length": 38.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.375, + "epoch": 2.430016863406408, + "grad_norm": 7.63639577539678, + "kl": 0.546875, + "learning_rate": 5.146711635750421e-07, + "loss": 0.0005, + "reward": 3.086588501930237, + "reward_std": 0.10986323654651642, + "rewards/final_reward": 1.012658648915526, + "rewards/mask_iou_reward": 0.506329324457763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0865885615348816, + "rewards/thk_ans_format_reward": 1.0, + "step": 1439, + "think_completion_length": 39.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.328125, + "epoch": 2.4317032040472175, + "grad_norm": 8.580362862361007, + "kl": 0.615234375, + "learning_rate": 5.143338954468802e-07, + "loss": 0.0006, + "reward": 3.609292507171631, + "reward_std": 0.2019364982843399, + "rewards/final_reward": 1.5419548230177922, + "rewards/mask_iou_reward": 0.7709774115088961, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.609292447566986, + "rewards/thk_ans_format_reward": 1.0, + "step": 1440, + "think_completion_length": 34.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.578125, + "epoch": 2.433389544688027, + "grad_norm": 6.399228270220903, + "kl": 0.626953125, + "learning_rate": 5.139966273187184e-07, + "loss": 0.0006, + "reward": 3.74202823638916, + "reward_std": 0.01405814103782177, + "rewards/final_reward": 1.6983815387364867, + "rewards/mask_iou_reward": 0.8491907693682433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7420281767845154, + "rewards/thk_ans_format_reward": 1.0, + "step": 1441, + "think_completion_length": 37.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.890625, + "epoch": 2.4350758853288363, + "grad_norm": 7.7051284301611584, + "kl": 0.68359375, + "learning_rate": 5.136593591905565e-07, + "loss": 0.0007, + "reward": 3.153511881828308, + "reward_std": 0.1515724379569292, + "rewards/final_reward": 1.4864379555430862, + "rewards/mask_iou_reward": 0.7432189777715431, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1535118222236633, + "rewards/thk_ans_format_reward": 1.0, + "step": 1442, + "think_completion_length": 37.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.234375, + "epoch": 2.436762225969646, + "grad_norm": 5.5674408687940335, + "kl": 0.5859375, + "learning_rate": 5.133220910623946e-07, + "loss": 0.0006, + "reward": 3.341616630554199, + "reward_std": 0.328810915350914, + "rewards/final_reward": 1.717067249798786, + "rewards/mask_iou_reward": 0.858533624899393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3416166305541992, + "rewards/thk_ans_format_reward": 1.0, + "step": 1443, + "think_completion_length": 35.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.265625, + "epoch": 2.438448566610455, + "grad_norm": 6.598616436762747, + "kl": 0.529296875, + "learning_rate": 5.129848229342328e-07, + "loss": 0.0006, + "reward": 3.5519330501556396, + "reward_std": 0.10587704600766301, + "rewards/final_reward": 1.7283281092139209, + "rewards/mask_iou_reward": 0.8641640546069604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55193293094635, + "rewards/thk_ans_format_reward": 1.0, + "step": 1444, + "think_completion_length": 37.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.21875, + "epoch": 2.440134907251265, + "grad_norm": 9.55776911413741, + "kl": 0.552734375, + "learning_rate": 5.126475548060709e-07, + "loss": 0.0006, + "reward": 3.2191598415374756, + "reward_std": 0.12895439565181732, + "rewards/final_reward": 1.3709724588566616, + "rewards/mask_iou_reward": 0.6854862294283308, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2191597819328308, + "rewards/thk_ans_format_reward": 1.0, + "step": 1445, + "think_completion_length": 35.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.609375, + "epoch": 2.441821247892074, + "grad_norm": 7.083086725950559, + "kl": 0.65625, + "learning_rate": 5.123102866779088e-07, + "loss": 0.0007, + "reward": 3.472890257835388, + "reward_std": 0.09058744460344315, + "rewards/final_reward": 1.7449738984243224, + "rewards/mask_iou_reward": 0.8724869492121612, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4728901982307434, + "rewards/thk_ans_format_reward": 1.0, + "step": 1446, + "think_completion_length": 37.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.125, + "epoch": 2.4435075885328836, + "grad_norm": 6.919845641855438, + "kl": 0.615234375, + "learning_rate": 5.11973018549747e-07, + "loss": 0.0006, + "reward": 3.4161049127578735, + "reward_std": 0.2600807845592499, + "rewards/final_reward": 1.4440276593637122, + "rewards/mask_iou_reward": 0.7220138296818561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4161049723625183, + "rewards/thk_ans_format_reward": 1.0, + "step": 1447, + "think_completion_length": 36.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.078125, + "epoch": 2.4451939291736933, + "grad_norm": 11.199767302410693, + "kl": 0.58984375, + "learning_rate": 5.116357504215851e-07, + "loss": 0.0006, + "reward": 3.305394768714905, + "reward_std": 0.12580876052379608, + "rewards/final_reward": 0.9565860016690916, + "rewards/mask_iou_reward": 0.4782930008345458, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3053947687149048, + "rewards/thk_ans_format_reward": 1.0, + "step": 1448, + "think_completion_length": 36.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.828125, + "epoch": 2.4468802698145025, + "grad_norm": 7.822717881138574, + "kl": 0.55078125, + "learning_rate": 5.112984822934232e-07, + "loss": 0.0006, + "reward": 3.0629160404205322, + "reward_std": 0.16106662526726723, + "rewards/final_reward": 1.2339833151801627, + "rewards/mask_iou_reward": 0.6169916575900813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.062916100025177, + "rewards/thk_ans_format_reward": 1.0, + "step": 1449, + "think_completion_length": 37.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.28125, + "epoch": 2.448566610455312, + "grad_norm": 7.477339217559345, + "kl": 0.5234375, + "learning_rate": 5.109612141652614e-07, + "loss": 0.0005, + "reward": 3.340023159980774, + "reward_std": 0.13386711478233337, + "rewards/final_reward": 1.5786122566293084, + "rewards/mask_iou_reward": 0.7893061283146542, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3400230407714844, + "rewards/thk_ans_format_reward": 1.0, + "step": 1450, + "think_completion_length": 44.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.640625, + "epoch": 2.4502529510961213, + "grad_norm": 20.58704918234177, + "kl": 0.5234375, + "learning_rate": 5.106239460370995e-07, + "loss": 0.0005, + "reward": 3.4118305444717407, + "reward_std": 0.33906523138284683, + "rewards/final_reward": 1.5533910212416537, + "rewards/mask_iou_reward": 0.7766955106208269, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4118306040763855, + "rewards/thk_ans_format_reward": 1.0, + "step": 1451, + "think_completion_length": 34.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.28125, + "epoch": 2.451939291736931, + "grad_norm": 7.301675768677225, + "kl": 0.8359375, + "learning_rate": 5.102866779089376e-07, + "loss": 0.0008, + "reward": 3.0271406173706055, + "reward_std": 0.18788279592990875, + "rewards/final_reward": 0.800656352545829, + "rewards/mask_iou_reward": 0.4003281762729145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0271407067775726, + "rewards/thk_ans_format_reward": 1.0, + "step": 1452, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.84375, + "epoch": 2.45362563237774, + "grad_norm": 7.854013631225384, + "kl": 0.55078125, + "learning_rate": 5.099494097807758e-07, + "loss": 0.0005, + "reward": 3.3608494997024536, + "reward_std": 0.11720556672662497, + "rewards/final_reward": 1.8089525754976725, + "rewards/mask_iou_reward": 0.9044762877488363, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3608494997024536, + "rewards/thk_ans_format_reward": 1.0, + "step": 1453, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 2.4553119730185498, + "grad_norm": 10.7742730214391, + "kl": 0.578125, + "learning_rate": 5.096121416526137e-07, + "loss": 0.0006, + "reward": 3.0832302570343018, + "reward_std": 0.21278557181358337, + "rewards/final_reward": 1.304668780640139, + "rewards/mask_iou_reward": 0.6523343903200695, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.083230197429657, + "rewards/thk_ans_format_reward": 1.0, + "step": 1454, + "think_completion_length": 35.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.84375, + "epoch": 2.4569983136593594, + "grad_norm": 9.536636881138167, + "kl": 0.55859375, + "learning_rate": 5.092748735244518e-07, + "loss": 0.0006, + "reward": 3.2125355005264282, + "reward_std": 0.08188419789075851, + "rewards/final_reward": 1.5690705940701015, + "rewards/mask_iou_reward": 0.7845352970350508, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.212535560131073, + "rewards/thk_ans_format_reward": 1.0, + "step": 1455, + "think_completion_length": 38.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.703125, + "epoch": 2.4586846543001686, + "grad_norm": 9.80674586850884, + "kl": 0.556640625, + "learning_rate": 5.0893760539629e-07, + "loss": 0.0006, + "reward": 3.300530433654785, + "reward_std": 0.08604636648669839, + "rewards/final_reward": 1.023525441031889, + "rewards/mask_iou_reward": 0.5117627205159445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3005304336547852, + "rewards/thk_ans_format_reward": 1.0, + "step": 1456, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5, + "epoch": 2.460370994940978, + "grad_norm": 5.343467075152932, + "kl": 0.6171875, + "learning_rate": 5.086003372681281e-07, + "loss": 0.0006, + "reward": 3.5373661518096924, + "reward_std": 0.1607318501919508, + "rewards/final_reward": 1.1570778227331253, + "rewards/mask_iou_reward": 0.5785389113665627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5373662114143372, + "rewards/thk_ans_format_reward": 1.0, + "step": 1457, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.53125, + "epoch": 2.4620573355817874, + "grad_norm": 5.496156238647655, + "kl": 0.546875, + "learning_rate": 5.082630691399663e-07, + "loss": 0.0005, + "reward": 3.036491870880127, + "reward_std": 0.05605571623891592, + "rewards/final_reward": 1.1909960638484658, + "rewards/mask_iou_reward": 0.5954980319242329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.036491721868515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1458, + "think_completion_length": 38.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.265625, + "epoch": 2.463743676222597, + "grad_norm": 4.796046963022677, + "kl": 0.666015625, + "learning_rate": 5.079258010118044e-07, + "loss": 0.0007, + "reward": 3.0368528366088867, + "reward_std": 0.05668491870164871, + "rewards/final_reward": 1.3625609609011828, + "rewards/mask_iou_reward": 0.6812804804505914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.036852777004242, + "rewards/thk_ans_format_reward": 1.0, + "step": 1459, + "think_completion_length": 38.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.734375, + "epoch": 2.4654300168634062, + "grad_norm": 18.43196278631787, + "kl": 0.513671875, + "learning_rate": 5.075885328836425e-07, + "loss": 0.0005, + "reward": 3.3323365449905396, + "reward_std": 0.29067130386829376, + "rewards/final_reward": 1.0533541995261038, + "rewards/mask_iou_reward": 0.5266770997630519, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.34796142578125, + "rewards/thk_ans_format_reward": 1.0, + "step": 1460, + "think_completion_length": 37.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.375, + "epoch": 2.467116357504216, + "grad_norm": 27.616315775749797, + "kl": 0.705078125, + "learning_rate": 5.072512647554807e-07, + "loss": 0.0007, + "reward": 3.1079777479171753, + "reward_std": 0.08986812457442284, + "rewards/final_reward": 0.5474214879453962, + "rewards/mask_iou_reward": 0.2737107439726981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.107977718114853, + "rewards/thk_ans_format_reward": 1.0, + "step": 1461, + "think_completion_length": 36.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.953125, + "epoch": 2.4688026981450255, + "grad_norm": 9.595931182246488, + "kl": 0.52734375, + "learning_rate": 5.069139966273187e-07, + "loss": 0.0005, + "reward": 3.1558161973953247, + "reward_std": 0.1381340161897242, + "rewards/final_reward": 1.4976028322847301, + "rewards/mask_iou_reward": 0.7488014161423651, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1558160781860352, + "rewards/thk_ans_format_reward": 1.0, + "step": 1462, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5, + "epoch": 2.4704890387858347, + "grad_norm": 10.297934092004121, + "kl": 0.5390625, + "learning_rate": 5.065767284991567e-07, + "loss": 0.0005, + "reward": 2.827468156814575, + "reward_std": 0.3423341289162636, + "rewards/final_reward": 0.9297495915607341, + "rewards/mask_iou_reward": 0.46487479578036706, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.827468067407608, + "rewards/thk_ans_format_reward": 1.0, + "step": 1463, + "think_completion_length": 38.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.40625, + "epoch": 2.4721753794266443, + "grad_norm": 13.431336757603988, + "kl": 0.5859375, + "learning_rate": 5.062394603709949e-07, + "loss": 0.0006, + "reward": 3.1791683435440063, + "reward_std": 0.21104427706450224, + "rewards/final_reward": 1.347296892498644, + "rewards/mask_iou_reward": 0.673648446249322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1791683435440063, + "rewards/thk_ans_format_reward": 1.0, + "step": 1464, + "think_completion_length": 41.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.453125, + "epoch": 2.4738617200674535, + "grad_norm": 6.4442907881178835, + "kl": 0.513671875, + "learning_rate": 5.05902192242833e-07, + "loss": 0.0005, + "reward": 3.7874099016189575, + "reward_std": 0.04841741733253002, + "rewards/final_reward": 1.7951469981212824, + "rewards/mask_iou_reward": 0.8975734990606412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7874098420143127, + "rewards/thk_ans_format_reward": 1.0, + "step": 1465, + "think_completion_length": 40.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.3125, + "epoch": 2.475548060708263, + "grad_norm": 7.781073503408825, + "kl": 0.564453125, + "learning_rate": 5.055649241146711e-07, + "loss": 0.0006, + "reward": 3.5474064350128174, + "reward_std": 0.23158020619302988, + "rewards/final_reward": 1.626896546940197, + "rewards/mask_iou_reward": 0.8134482734700985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5474063754081726, + "rewards/thk_ans_format_reward": 1.0, + "step": 1466, + "think_completion_length": 40.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.53125, + "epoch": 2.4772344013490724, + "grad_norm": 8.120456104997903, + "kl": 0.607421875, + "learning_rate": 5.052276559865093e-07, + "loss": 0.0006, + "reward": 3.499011993408203, + "reward_std": 0.10200574016198516, + "rewards/final_reward": 1.596909988674616, + "rewards/mask_iou_reward": 0.798454994337308, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4990119338035583, + "rewards/thk_ans_format_reward": 1.0, + "step": 1467, + "think_completion_length": 35.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.609375, + "epoch": 2.478920741989882, + "grad_norm": 10.586731590590368, + "kl": 0.58984375, + "learning_rate": 5.048903878583474e-07, + "loss": 0.0006, + "reward": 3.4600088596343994, + "reward_std": 0.19180525839328766, + "rewards/final_reward": 1.322128830048752, + "rewards/mask_iou_reward": 0.661064415024376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4600088596343994, + "rewards/thk_ans_format_reward": 1.0, + "step": 1468, + "think_completion_length": 39.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.46875, + "epoch": 2.4806070826306916, + "grad_norm": 7.880859330095419, + "kl": 0.5625, + "learning_rate": 5.045531197301855e-07, + "loss": 0.0006, + "reward": 3.185870885848999, + "reward_std": 0.20582804456353188, + "rewards/final_reward": 1.714308307994534, + "rewards/mask_iou_reward": 0.857154153997267, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1858709752559662, + "rewards/thk_ans_format_reward": 1.0, + "step": 1469, + "think_completion_length": 41.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.90625, + "epoch": 2.482293423271501, + "grad_norm": 11.452435006601872, + "kl": 0.5546875, + "learning_rate": 5.042158516020237e-07, + "loss": 0.0006, + "reward": 3.056984066963196, + "reward_std": 0.2701308634132147, + "rewards/final_reward": 1.4174552743281663, + "rewards/mask_iou_reward": 0.7087276371640832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0569840967655182, + "rewards/thk_ans_format_reward": 1.0, + "step": 1470, + "think_completion_length": 35.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.765625, + "epoch": 2.4839797639123105, + "grad_norm": 17.45227197751396, + "kl": 0.603515625, + "learning_rate": 5.038785834738617e-07, + "loss": 0.0006, + "reward": 3.5259718894958496, + "reward_std": 0.13498846907168627, + "rewards/final_reward": 1.3153849615984572, + "rewards/mask_iou_reward": 0.6576924807992286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5259720087051392, + "rewards/thk_ans_format_reward": 1.0, + "step": 1471, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.8125, + "epoch": 2.4856661045531196, + "grad_norm": 5.423812455061472, + "kl": 0.71484375, + "learning_rate": 5.035413153456997e-07, + "loss": 0.0007, + "reward": 2.9699904918670654, + "reward_std": 0.07269694283604622, + "rewards/final_reward": 0.27860076343864043, + "rewards/mask_iou_reward": 0.13930038171932022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9699905514717102, + "rewards/thk_ans_format_reward": 1.0, + "step": 1472, + "think_completion_length": 41.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.578125, + "epoch": 2.4873524451939293, + "grad_norm": 9.20306864688425, + "kl": 0.533203125, + "learning_rate": 5.032040472175379e-07, + "loss": 0.0005, + "reward": 3.86440372467041, + "reward_std": 0.0190952280536294, + "rewards/final_reward": 1.8322273764191868, + "rewards/mask_iou_reward": 0.9161136882095934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8644036650657654, + "rewards/thk_ans_format_reward": 1.0, + "step": 1473, + "think_completion_length": 36.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.34375, + "epoch": 2.4890387858347385, + "grad_norm": 15.323120871459215, + "kl": 0.529296875, + "learning_rate": 5.02866779089376e-07, + "loss": 0.0005, + "reward": 2.5336424112319946, + "reward_std": 0.3594963401556015, + "rewards/final_reward": 0.48737288693574415, + "rewards/mask_iou_reward": 0.24368644346787208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5336422324180603, + "rewards/thk_ans_format_reward": 1.0, + "step": 1474, + "think_completion_length": 44.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.390625, + "epoch": 2.490725126475548, + "grad_norm": 6.752980501040975, + "kl": 0.52734375, + "learning_rate": 5.025295109612141e-07, + "loss": 0.0006, + "reward": 3.02541720867157, + "reward_std": 0.1034752493724227, + "rewards/final_reward": 1.149960956271575, + "rewards/mask_iou_reward": 0.5749804781357875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0254172086715698, + "rewards/thk_ans_format_reward": 1.0, + "step": 1475, + "think_completion_length": 41.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.390625, + "epoch": 2.4924114671163577, + "grad_norm": 222.0518474690732, + "kl": 0.4921875, + "learning_rate": 5.021922428330523e-07, + "loss": 0.0005, + "reward": 3.418384552001953, + "reward_std": 0.10731749702244997, + "rewards/final_reward": 1.6023031364795055, + "rewards/mask_iou_reward": 0.8011515682397528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4183844923973083, + "rewards/thk_ans_format_reward": 1.0, + "step": 1476, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.265625, + "epoch": 2.494097807757167, + "grad_norm": 7.146648683338561, + "kl": 0.609375, + "learning_rate": 5.018549747048904e-07, + "loss": 0.0006, + "reward": 3.5869948863983154, + "reward_std": 0.0974464938044548, + "rewards/final_reward": 1.3416405430706282, + "rewards/mask_iou_reward": 0.6708202715353141, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5869948267936707, + "rewards/thk_ans_format_reward": 1.0, + "step": 1477, + "think_completion_length": 41.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5, + "epoch": 2.4957841483979766, + "grad_norm": 5.366598932763867, + "kl": 0.4912109375, + "learning_rate": 5.015177065767285e-07, + "loss": 0.0005, + "reward": 3.449162483215332, + "reward_std": 0.04182947881054133, + "rewards/final_reward": 1.195586466228961, + "rewards/mask_iou_reward": 0.5977932331144805, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4491624236106873, + "rewards/thk_ans_format_reward": 1.0, + "step": 1478, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.796875, + "epoch": 2.4974704890387858, + "grad_norm": 5.736300280187392, + "kl": 0.62109375, + "learning_rate": 5.011804384485666e-07, + "loss": 0.0006, + "reward": 3.5285149812698364, + "reward_std": 0.28720738738775253, + "rewards/final_reward": 1.6930366481005783, + "rewards/mask_iou_reward": 0.8465183240502892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5285149812698364, + "rewards/thk_ans_format_reward": 1.0, + "step": 1479, + "think_completion_length": 34.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.5625, + "epoch": 2.4991568296795954, + "grad_norm": 8.24736762704496, + "kl": 0.5224609375, + "learning_rate": 5.008431703204047e-07, + "loss": 0.0005, + "reward": 3.488741397857666, + "reward_std": 0.25808994472026825, + "rewards/final_reward": 1.59323068836111, + "rewards/mask_iou_reward": 0.796615344180555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4887413382530212, + "rewards/thk_ans_format_reward": 1.0, + "step": 1480, + "think_completion_length": 35.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.6875, + "epoch": 2.5008431703204046, + "grad_norm": 11.027305372617139, + "kl": 0.50390625, + "learning_rate": 5.005059021922427e-07, + "loss": 0.0005, + "reward": 3.311052680015564, + "reward_std": 0.36544879525899887, + "rewards/final_reward": 1.7231988014742277, + "rewards/mask_iou_reward": 0.8615994007371138, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3110525608062744, + "rewards/thk_ans_format_reward": 1.0, + "step": 1481, + "think_completion_length": 42.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.65625, + "epoch": 2.5025295109612142, + "grad_norm": 11.319671035463822, + "kl": 0.626953125, + "learning_rate": 5.001686340640809e-07, + "loss": 0.0006, + "reward": 3.3623111248016357, + "reward_std": 0.09125454165041447, + "rewards/final_reward": 1.0992310112762569, + "rewards/mask_iou_reward": 0.5496155056381284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3623109459877014, + "rewards/thk_ans_format_reward": 1.0, + "step": 1482, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5, + "epoch": 2.504215851602024, + "grad_norm": 6.014732577915013, + "kl": 0.4951171875, + "learning_rate": 4.99831365935919e-07, + "loss": 0.0005, + "reward": 3.3078333139419556, + "reward_std": 0.4521195776760578, + "rewards/final_reward": 1.6163931838358212, + "rewards/mask_iou_reward": 0.8081965919179106, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3703332543373108, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1483, + "think_completion_length": 37.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.625, + "epoch": 2.505902192242833, + "grad_norm": 8.711154969838693, + "kl": 0.57421875, + "learning_rate": 4.994940978077571e-07, + "loss": 0.0006, + "reward": 3.4808534383773804, + "reward_std": 0.10041437298059464, + "rewards/final_reward": 1.5617287270725693, + "rewards/mask_iou_reward": 0.7808643635362846, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4808533787727356, + "rewards/thk_ans_format_reward": 1.0, + "step": 1484, + "think_completion_length": 33.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.921875, + "epoch": 2.5075885328836423, + "grad_norm": 6.4521826255397885, + "kl": 0.6279296875, + "learning_rate": 4.991568296795953e-07, + "loss": 0.0006, + "reward": 3.675198793411255, + "reward_std": 0.23655812442302704, + "rewards/final_reward": 1.8064039526925049, + "rewards/mask_iou_reward": 0.9032019763462524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6751986742019653, + "rewards/thk_ans_format_reward": 1.0, + "step": 1485, + "think_completion_length": 41.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.109375, + "epoch": 2.509274873524452, + "grad_norm": 9.535230698250063, + "kl": 0.544921875, + "learning_rate": 4.988195615514334e-07, + "loss": 0.0005, + "reward": 3.2731704711914062, + "reward_std": 0.18547899648547173, + "rewards/final_reward": 1.2086966743957175, + "rewards/mask_iou_reward": 0.6043483371978587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2731704115867615, + "rewards/thk_ans_format_reward": 1.0, + "step": 1486, + "think_completion_length": 35.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.640625, + "epoch": 2.5109612141652615, + "grad_norm": 12.694000418125398, + "kl": 0.5703125, + "learning_rate": 4.984822934232715e-07, + "loss": 0.0006, + "reward": 2.720350742340088, + "reward_std": 0.07947659306228161, + "rewards/final_reward": 1.0917999045996127, + "rewards/mask_iou_reward": 0.5458999522998064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7203506827354431, + "rewards/thk_ans_format_reward": 1.0, + "step": 1487, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.96875, + "epoch": 2.5126475548060707, + "grad_norm": 10.410175507358746, + "kl": 0.58203125, + "learning_rate": 4.981450252951096e-07, + "loss": 0.0006, + "reward": 3.588701009750366, + "reward_std": 0.03587500285357237, + "rewards/final_reward": 1.8622233827402295, + "rewards/mask_iou_reward": 0.9311116913701147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5887010097503662, + "rewards/thk_ans_format_reward": 1.0, + "step": 1488, + "think_completion_length": 36.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.625, + "epoch": 2.5143338954468804, + "grad_norm": 7.451733615106546, + "kl": 0.56640625, + "learning_rate": 4.978077571669478e-07, + "loss": 0.0006, + "reward": 3.5282169580459595, + "reward_std": 0.10562526807188988, + "rewards/final_reward": 1.3080303513620586, + "rewards/mask_iou_reward": 0.6540151756810293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.528217077255249, + "rewards/thk_ans_format_reward": 1.0, + "step": 1489, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.34375, + "epoch": 2.51602023608769, + "grad_norm": 19.399719466274046, + "kl": 0.498046875, + "learning_rate": 4.974704890387858e-07, + "loss": 0.0005, + "reward": 3.088352680206299, + "reward_std": 0.4080119878053665, + "rewards/final_reward": 1.228325888967735, + "rewards/mask_iou_reward": 0.6141629444838675, + "rewards/sam_format_reward": 0.921875, + "rewards/sam_reward_func_ultra": 1.2446027398109436, + "rewards/thk_ans_format_reward": 0.921875, + "step": 1490, + "think_completion_length": 36.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.96875, + "epoch": 2.517706576728499, + "grad_norm": 7.137551160868289, + "kl": 2.7939453125, + "learning_rate": 4.971332209106239e-07, + "loss": 0.0028, + "reward": 3.375158429145813, + "reward_std": 0.16005902830511332, + "rewards/final_reward": 1.8587552642109353, + "rewards/mask_iou_reward": 0.9293776321054676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.375158429145813, + "rewards/thk_ans_format_reward": 1.0, + "step": 1491, + "think_completion_length": 35.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5, + "epoch": 2.5193929173693084, + "grad_norm": 8.313529358750277, + "kl": 0.46484375, + "learning_rate": 4.96795952782462e-07, + "loss": 0.0005, + "reward": 3.03733229637146, + "reward_std": 0.10004133731126785, + "rewards/final_reward": 1.4227331040090978, + "rewards/mask_iou_reward": 0.7113665520045489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0373322367668152, + "rewards/thk_ans_format_reward": 1.0, + "step": 1492, + "think_completion_length": 37.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.296875, + "epoch": 2.521079258010118, + "grad_norm": 13.336282053939824, + "kl": 0.54296875, + "learning_rate": 4.964586846543001e-07, + "loss": 0.0005, + "reward": 3.2911131381988525, + "reward_std": 0.08773962408304214, + "rewards/final_reward": 0.6487437595690562, + "rewards/mask_iou_reward": 0.3243718797845281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2911131978034973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1493, + "think_completion_length": 35.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.28125, + "epoch": 2.5227655986509276, + "grad_norm": 37.03975936055251, + "kl": 0.5, + "learning_rate": 4.961214165261383e-07, + "loss": 0.0005, + "reward": 3.58980131149292, + "reward_std": 0.0956022769678384, + "rewards/final_reward": 1.7125394411309012, + "rewards/mask_iou_reward": 0.8562697205654506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.58980131149292, + "rewards/thk_ans_format_reward": 1.0, + "step": 1494, + "think_completion_length": 40.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.640625, + "epoch": 2.524451939291737, + "grad_norm": 5.045301472235186, + "kl": 0.56640625, + "learning_rate": 4.957841483979764e-07, + "loss": 0.0006, + "reward": 2.850724458694458, + "reward_std": 0.1821054145693779, + "rewards/final_reward": 0.9959888150127159, + "rewards/mask_iou_reward": 0.49799440750635793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8507243692874908, + "rewards/thk_ans_format_reward": 1.0, + "step": 1495, + "think_completion_length": 37.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.625, + "epoch": 2.5261382799325465, + "grad_norm": 14.382459957277339, + "kl": 0.5625, + "learning_rate": 4.954468802698145e-07, + "loss": 0.0006, + "reward": 2.9980448484420776, + "reward_std": 0.07024937309324741, + "rewards/final_reward": 1.1191838693528742, + "rewards/mask_iou_reward": 0.5595919346764371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9980448931455612, + "rewards/thk_ans_format_reward": 1.0, + "step": 1496, + "think_completion_length": 38.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.359375, + "epoch": 2.5278246205733557, + "grad_norm": 7.46836121492322, + "kl": 0.513671875, + "learning_rate": 4.951096121416526e-07, + "loss": 0.0005, + "reward": 3.2769733667373657, + "reward_std": 0.15931928902864456, + "rewards/final_reward": 1.3874132598866995, + "rewards/mask_iou_reward": 0.6937066299433498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2769732475280762, + "rewards/thk_ans_format_reward": 1.0, + "step": 1497, + "think_completion_length": 39.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.296875, + "epoch": 2.5295109612141653, + "grad_norm": 11.852359868027275, + "kl": 0.51953125, + "learning_rate": 4.947723440134908e-07, + "loss": 0.0005, + "reward": 3.484477162361145, + "reward_std": 0.2647605128586292, + "rewards/final_reward": 1.5508365032119544, + "rewards/mask_iou_reward": 0.7754182516059772, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.500102162361145, + "rewards/thk_ans_format_reward": 1.0, + "step": 1498, + "think_completion_length": 40.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.546875, + "epoch": 2.5311973018549745, + "grad_norm": 7.236240225814442, + "kl": 1.712890625, + "learning_rate": 4.944350758853287e-07, + "loss": 0.0017, + "reward": 3.281991481781006, + "reward_std": 0.09113920107483864, + "rewards/final_reward": 1.6733748077233552, + "rewards/mask_iou_reward": 0.8366874038616776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2819915413856506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1499, + "think_completion_length": 35.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.953125, + "epoch": 2.532883642495784, + "grad_norm": 5.106811671021872, + "kl": 0.529296875, + "learning_rate": 4.940978077571669e-07, + "loss": 0.0005, + "reward": 3.3747832775115967, + "reward_std": 0.03719430975615978, + "rewards/final_reward": 1.7466277158454178, + "rewards/mask_iou_reward": 0.8733138579227089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3747830986976624, + "rewards/thk_ans_format_reward": 1.0, + "step": 1500, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.859375, + "epoch": 2.5345699831365938, + "grad_norm": 8.000033394139844, + "kl": 0.5205078125, + "learning_rate": 4.93760539629005e-07, + "loss": 0.0005, + "reward": 3.3949146270751953, + "reward_std": 0.06679772771894932, + "rewards/final_reward": 1.4510022893249355, + "rewards/mask_iou_reward": 0.7255011446624677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3949146270751953, + "rewards/thk_ans_format_reward": 1.0, + "step": 1501, + "think_completion_length": 38.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.625, + "epoch": 2.536256323777403, + "grad_norm": 27.011764406123124, + "kl": 0.55859375, + "learning_rate": 4.934232715008432e-07, + "loss": 0.0006, + "reward": 3.6643035411834717, + "reward_std": 0.10874908417463303, + "rewards/final_reward": 1.6199177468343642, + "rewards/mask_iou_reward": 0.8099588734171821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6643034219741821, + "rewards/thk_ans_format_reward": 1.0, + "step": 1502, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.015625, + "epoch": 2.5379426644182126, + "grad_norm": 11.198242643547353, + "kl": 0.54296875, + "learning_rate": 4.930860033726813e-07, + "loss": 0.0005, + "reward": 3.4217268228530884, + "reward_std": 0.14082890190184116, + "rewards/final_reward": 1.3744126428382355, + "rewards/mask_iou_reward": 0.6872063214191177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4217267632484436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1503, + "think_completion_length": 41.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.921875, + "epoch": 2.539629005059022, + "grad_norm": 6.024812195755251, + "kl": 1.859375, + "learning_rate": 4.927487352445194e-07, + "loss": 0.0019, + "reward": 3.6555745601654053, + "reward_std": 0.021429577842354774, + "rewards/final_reward": 1.4500117782519568, + "rewards/mask_iou_reward": 0.7250058891259784, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6555745005607605, + "rewards/thk_ans_format_reward": 1.0, + "step": 1504, + "think_completion_length": 40.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.796875, + "epoch": 2.5413153456998314, + "grad_norm": 4.938521081033098, + "kl": 0.560546875, + "learning_rate": 4.924114671163575e-07, + "loss": 0.0006, + "reward": 3.088452100753784, + "reward_std": 0.11477963626384735, + "rewards/final_reward": 1.2085818779067128, + "rewards/mask_iou_reward": 0.6042909389533564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0884520709514618, + "rewards/thk_ans_format_reward": 1.0, + "step": 1505, + "think_completion_length": 37.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 2.5430016863406406, + "grad_norm": 23.17355611185932, + "kl": 0.5234375, + "learning_rate": 4.920741989881956e-07, + "loss": 0.0005, + "reward": 3.434250831604004, + "reward_std": 0.0526156984269619, + "rewards/final_reward": 1.5713436340782292, + "rewards/mask_iou_reward": 0.7856718170391146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4342508912086487, + "rewards/thk_ans_format_reward": 1.0, + "step": 1506, + "think_completion_length": 42.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.625, + "epoch": 2.5446880269814502, + "grad_norm": 4.824502597592859, + "kl": 0.431640625, + "learning_rate": 4.917369308600338e-07, + "loss": 0.0004, + "reward": 3.3531363010406494, + "reward_std": 0.09069814160466194, + "rewards/final_reward": 1.1147132374340087, + "rewards/mask_iou_reward": 0.5573566187170044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3531363010406494, + "rewards/thk_ans_format_reward": 1.0, + "step": 1507, + "think_completion_length": 47.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.515625, + "epoch": 2.54637436762226, + "grad_norm": 9.57819569491857, + "kl": 0.546875, + "learning_rate": 4.913996627318718e-07, + "loss": 0.0006, + "reward": 3.4678783416748047, + "reward_std": 0.16838806122541428, + "rewards/final_reward": 1.2330647677428894, + "rewards/mask_iou_reward": 0.6165323838714447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4678783416748047, + "rewards/thk_ans_format_reward": 1.0, + "step": 1508, + "think_completion_length": 37.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.796875, + "epoch": 2.548060708263069, + "grad_norm": 5.676967471760433, + "kl": 0.533203125, + "learning_rate": 4.910623946037099e-07, + "loss": 0.0006, + "reward": 3.423642158508301, + "reward_std": 0.09652687440393493, + "rewards/final_reward": 1.5992528204709278, + "rewards/mask_iou_reward": 0.7996264102354639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4236420392990112, + "rewards/thk_ans_format_reward": 1.0, + "step": 1509, + "think_completion_length": 40.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.890625, + "epoch": 2.5497470489038787, + "grad_norm": 7.789483168092156, + "kl": 0.513671875, + "learning_rate": 4.90725126475548e-07, + "loss": 0.0005, + "reward": 2.3797736167907715, + "reward_std": 0.11256012320518494, + "rewards/final_reward": 0.4328186320145556, + "rewards/mask_iou_reward": 0.2164093160072778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.3797735422849655, + "rewards/thk_ans_format_reward": 1.0, + "step": 1510, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.03125, + "epoch": 2.551433389544688, + "grad_norm": 9.713157031940664, + "kl": 0.544921875, + "learning_rate": 4.903878583473862e-07, + "loss": 0.0005, + "reward": 3.414551854133606, + "reward_std": 0.11701996996998787, + "rewards/final_reward": 1.3789813631599346, + "rewards/mask_iou_reward": 0.6894906815799673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4145516157150269, + "rewards/thk_ans_format_reward": 1.0, + "step": 1511, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.28125, + "epoch": 2.5531197301854975, + "grad_norm": 5.514874996478179, + "kl": 0.58203125, + "learning_rate": 4.900505902192242e-07, + "loss": 0.0006, + "reward": 3.4243216514587402, + "reward_std": 0.0784766897559166, + "rewards/final_reward": 1.890977303160391, + "rewards/mask_iou_reward": 0.9454886515801955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4243216514587402, + "rewards/thk_ans_format_reward": 1.0, + "step": 1512, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.234375, + "epoch": 2.5548060708263067, + "grad_norm": 8.96725315369348, + "kl": 0.513671875, + "learning_rate": 4.897133220910624e-07, + "loss": 0.0005, + "reward": 2.880871295928955, + "reward_std": 0.15390251949429512, + "rewards/final_reward": 1.1972393384262139, + "rewards/mask_iou_reward": 0.5986196692131069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8808711767196655, + "rewards/thk_ans_format_reward": 1.0, + "step": 1513, + "think_completion_length": 42.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.71875, + "epoch": 2.5564924114671164, + "grad_norm": 27.2516380009568, + "kl": 0.517578125, + "learning_rate": 4.893760539629005e-07, + "loss": 0.0005, + "reward": 3.5386931896209717, + "reward_std": 0.22735736519098282, + "rewards/final_reward": 1.276440365439687, + "rewards/mask_iou_reward": 0.6382201827198435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.538693130016327, + "rewards/thk_ans_format_reward": 1.0, + "step": 1514, + "think_completion_length": 48.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.671875, + "epoch": 2.558178752107926, + "grad_norm": 5.983763326608007, + "kl": 0.529296875, + "learning_rate": 4.890387858347387e-07, + "loss": 0.0005, + "reward": 3.8019983768463135, + "reward_std": 0.02921892609447241, + "rewards/final_reward": 1.868028471346339, + "rewards/mask_iou_reward": 0.9340142356731695, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8019983768463135, + "rewards/thk_ans_format_reward": 1.0, + "step": 1515, + "think_completion_length": 38.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.34375, + "epoch": 2.559865092748735, + "grad_norm": 4.5608997779574585, + "kl": 0.48046875, + "learning_rate": 4.887015177065766e-07, + "loss": 0.0005, + "reward": 2.8760156631469727, + "reward_std": 0.414703406393528, + "rewards/final_reward": 0.7626393288297652, + "rewards/mask_iou_reward": 0.3813196644148826, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.0010156631469727, + "rewards/thk_ans_format_reward": 0.9375, + "step": 1516, + "think_completion_length": 47.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.015625, + "epoch": 2.561551433389545, + "grad_norm": 9.563988407014383, + "kl": 0.494140625, + "learning_rate": 4.883642495784148e-07, + "loss": 0.0005, + "reward": 3.6301556825637817, + "reward_std": 0.5168076306581497, + "rewards/final_reward": 1.6550659131921477, + "rewards/mask_iou_reward": 0.8275329565960738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.630155622959137, + "rewards/thk_ans_format_reward": 1.0, + "step": 1517, + "think_completion_length": 49.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.515625, + "epoch": 2.563237774030354, + "grad_norm": 6.757332895940087, + "kl": 0.4267578125, + "learning_rate": 4.880269814502529e-07, + "loss": 0.0004, + "reward": 3.1780283451080322, + "reward_std": 0.18703092634677887, + "rewards/final_reward": 1.1283806928614566, + "rewards/mask_iou_reward": 0.5641903464307283, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1780283451080322, + "rewards/thk_ans_format_reward": 1.0, + "step": 1518, + "think_completion_length": 41.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.140625, + "epoch": 2.5649241146711637, + "grad_norm": 6.581563060182763, + "kl": 0.841796875, + "learning_rate": 4.87689713322091e-07, + "loss": 0.0008, + "reward": 3.018182158470154, + "reward_std": 0.04332828428596258, + "rewards/final_reward": 1.3776301205739845, + "rewards/mask_iou_reward": 0.6888150602869922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.018182247877121, + "rewards/thk_ans_format_reward": 1.0, + "step": 1519, + "think_completion_length": 41.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.21875, + "epoch": 2.566610455311973, + "grad_norm": 16.043120163718434, + "kl": 0.482421875, + "learning_rate": 4.873524451939291e-07, + "loss": 0.0005, + "reward": 3.086033821105957, + "reward_std": 0.15922314673662186, + "rewards/final_reward": 0.9320970069593832, + "rewards/mask_iou_reward": 0.4660485034796916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.086033821105957, + "rewards/thk_ans_format_reward": 1.0, + "step": 1520, + "think_completion_length": 45.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.71875, + "epoch": 2.5682967959527825, + "grad_norm": 12.811754975192569, + "kl": 0.568359375, + "learning_rate": 4.870151770657673e-07, + "loss": 0.0006, + "reward": 3.313421607017517, + "reward_std": 0.12244333326816559, + "rewards/final_reward": 1.017356395968409, + "rewards/mask_iou_reward": 0.5086781979842045, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3134216666221619, + "rewards/thk_ans_format_reward": 1.0, + "step": 1521, + "think_completion_length": 43.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.015625, + "epoch": 2.569983136593592, + "grad_norm": 6.3633128568114, + "kl": 0.580078125, + "learning_rate": 4.866779089376054e-07, + "loss": 0.0006, + "reward": 3.3072917461395264, + "reward_std": 0.11184610333293676, + "rewards/final_reward": 1.2002857379851997, + "rewards/mask_iou_reward": 0.6001428689925998, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.307291865348816, + "rewards/thk_ans_format_reward": 1.0, + "step": 1522, + "think_completion_length": 51.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.359375, + "epoch": 2.5716694772344013, + "grad_norm": 20.475153625295707, + "kl": 0.458984375, + "learning_rate": 4.863406408094435e-07, + "loss": 0.0004, + "reward": 3.3745312690734863, + "reward_std": 0.2294555902481079, + "rewards/final_reward": 1.0695021371042701, + "rewards/mask_iou_reward": 0.5347510685521351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.374531239271164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1523, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.390625, + "epoch": 2.573355817875211, + "grad_norm": 6.926455621132243, + "kl": 0.4404296875, + "learning_rate": 4.860033726812816e-07, + "loss": 0.0004, + "reward": 3.157252073287964, + "reward_std": 0.29762740433216095, + "rewards/final_reward": 0.802155203549531, + "rewards/mask_iou_reward": 0.4010776017747655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1572520732879639, + "rewards/thk_ans_format_reward": 1.0, + "step": 1524, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.765625, + "epoch": 2.57504215851602, + "grad_norm": 5.606031237944716, + "kl": 0.47265625, + "learning_rate": 4.856661045531196e-07, + "loss": 0.0005, + "reward": 3.2091275453567505, + "reward_std": 0.10631818510591984, + "rewards/final_reward": 1.401239787273532, + "rewards/mask_iou_reward": 0.700619893636766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2091275751590729, + "rewards/thk_ans_format_reward": 1.0, + "step": 1525, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.0625, + "epoch": 2.5767284991568298, + "grad_norm": 18.99856368894485, + "kl": 0.4482421875, + "learning_rate": 4.853288364249578e-07, + "loss": 0.0004, + "reward": 3.006584405899048, + "reward_std": 0.3390379399061203, + "rewards/final_reward": 0.8887834962915174, + "rewards/mask_iou_reward": 0.4443917481457587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0065844655036926, + "rewards/thk_ans_format_reward": 1.0, + "step": 1526, + "think_completion_length": 46.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.46875, + "epoch": 2.578414839797639, + "grad_norm": 10.312057039994464, + "kl": 0.5625, + "learning_rate": 4.849915682967959e-07, + "loss": 0.0006, + "reward": 3.5264012813568115, + "reward_std": 0.06957072392106056, + "rewards/final_reward": 1.8601035590132144, + "rewards/mask_iou_reward": 0.9300517795066072, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5264012813568115, + "rewards/thk_ans_format_reward": 1.0, + "step": 1527, + "think_completion_length": 41.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.3125, + "epoch": 2.5801011804384486, + "grad_norm": 6.768274528540181, + "kl": 0.576171875, + "learning_rate": 4.846543001686341e-07, + "loss": 0.0006, + "reward": 3.2123255729675293, + "reward_std": 0.16551712527871132, + "rewards/final_reward": 1.4633281006480252, + "rewards/mask_iou_reward": 0.7316640503240126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2123255133628845, + "rewards/thk_ans_format_reward": 1.0, + "step": 1528, + "think_completion_length": 47.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.59375, + "epoch": 2.5817875210792582, + "grad_norm": 4.944611934648343, + "kl": 0.583984375, + "learning_rate": 4.843170320404721e-07, + "loss": 0.0006, + "reward": 3.326672673225403, + "reward_std": 0.0943008842295967, + "rewards/final_reward": 0.8625031017373519, + "rewards/mask_iou_reward": 0.43125155086867595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3266727328300476, + "rewards/thk_ans_format_reward": 1.0, + "step": 1529, + "think_completion_length": 45.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5, + "epoch": 2.5834738617200674, + "grad_norm": 7.237087579186458, + "kl": 0.484375, + "learning_rate": 4.839797639123103e-07, + "loss": 0.0005, + "reward": 3.3302892446517944, + "reward_std": 0.2636425644159317, + "rewards/final_reward": 1.2618188810091784, + "rewards/mask_iou_reward": 0.6309094405045892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3302891850471497, + "rewards/thk_ans_format_reward": 1.0, + "step": 1530, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.765625, + "epoch": 2.5851602023608766, + "grad_norm": 12.82926820626442, + "kl": 0.4873046875, + "learning_rate": 4.836424957841484e-07, + "loss": 0.0005, + "reward": 3.2265427112579346, + "reward_std": 0.19642452150583267, + "rewards/final_reward": 1.2173636238875956, + "rewards/mask_iou_reward": 0.6086818119437978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2265428006649017, + "rewards/thk_ans_format_reward": 1.0, + "step": 1531, + "think_completion_length": 47.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.015625, + "epoch": 2.5868465430016863, + "grad_norm": 10.453234101933612, + "kl": 0.51953125, + "learning_rate": 4.833052276559865e-07, + "loss": 0.0005, + "reward": 2.6653435230255127, + "reward_std": 0.0757587868720293, + "rewards/final_reward": 1.1625929683638556, + "rewards/mask_iou_reward": 0.5812964841819278, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6653436124324799, + "rewards/thk_ans_format_reward": 1.0, + "step": 1532, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.328125, + "epoch": 2.588532883642496, + "grad_norm": 6.342628822025022, + "kl": 0.556640625, + "learning_rate": 4.829679595278246e-07, + "loss": 0.0006, + "reward": 3.794961452484131, + "reward_std": 0.13813296146690845, + "rewards/final_reward": 1.9120891559917732, + "rewards/mask_iou_reward": 0.9560445779958866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.794961392879486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1533, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.578125, + "epoch": 2.590219224283305, + "grad_norm": 18.23118607933623, + "kl": 0.515625, + "learning_rate": 4.826306913996627e-07, + "loss": 0.0005, + "reward": 2.9826323986053467, + "reward_std": 0.03328784089535475, + "rewards/final_reward": 0.8880378820892472, + "rewards/mask_iou_reward": 0.4440189410446236, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9826323986053467, + "rewards/thk_ans_format_reward": 1.0, + "step": 1534, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.09375, + "epoch": 2.5919055649241147, + "grad_norm": 8.61276484430537, + "kl": 0.5537109375, + "learning_rate": 4.822934232715008e-07, + "loss": 0.0006, + "reward": 3.336413264274597, + "reward_std": 0.17770569026470184, + "rewards/final_reward": 1.638049519197422, + "rewards/mask_iou_reward": 0.819024759598711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3364134430885315, + "rewards/thk_ans_format_reward": 1.0, + "step": 1535, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.140625, + "epoch": 2.5935919055649244, + "grad_norm": 5.034058915635746, + "kl": 0.533203125, + "learning_rate": 4.819561551433389e-07, + "loss": 0.0005, + "reward": 3.8958654403686523, + "reward_std": 0.021186801604926586, + "rewards/final_reward": 1.8879820680199333, + "rewards/mask_iou_reward": 0.9439910340099666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.895865261554718, + "rewards/thk_ans_format_reward": 1.0, + "step": 1536, + "think_completion_length": 43.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.234375, + "epoch": 2.5952782462057336, + "grad_norm": 12.630529991131136, + "kl": 0.55859375, + "learning_rate": 4.81618887015177e-07, + "loss": 0.0006, + "reward": 3.3030004501342773, + "reward_std": 0.10049432516098022, + "rewards/final_reward": 1.3787357369782953, + "rewards/mask_iou_reward": 0.6893678684891477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303000271320343, + "rewards/thk_ans_format_reward": 1.0, + "step": 1537, + "think_completion_length": 45.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.359375, + "epoch": 2.5969645868465427, + "grad_norm": 13.931755040748557, + "kl": 0.49609375, + "learning_rate": 4.812816188870151e-07, + "loss": 0.0005, + "reward": 3.333008289337158, + "reward_std": 0.15168678015470505, + "rewards/final_reward": 1.42218525400646, + "rewards/mask_iou_reward": 0.71109262700323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3330082297325134, + "rewards/thk_ans_format_reward": 1.0, + "step": 1538, + "think_completion_length": 45.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.0625, + "epoch": 2.5986509274873524, + "grad_norm": 5.2649389912705375, + "kl": 0.5078125, + "learning_rate": 4.809443507588533e-07, + "loss": 0.0005, + "reward": 2.6697299480438232, + "reward_std": 0.0926759373396635, + "rewards/final_reward": 0.5165016829960553, + "rewards/mask_iou_reward": 0.25825084149802763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6697298586368561, + "rewards/thk_ans_format_reward": 1.0, + "step": 1539, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.328125, + "epoch": 2.600337268128162, + "grad_norm": 21.037030317936612, + "kl": 0.55859375, + "learning_rate": 4.806070826306914e-07, + "loss": 0.0006, + "reward": 3.2079389095306396, + "reward_std": 0.12378528714179993, + "rewards/final_reward": 1.4359067824635598, + "rewards/mask_iou_reward": 0.7179533912317799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2079389095306396, + "rewards/thk_ans_format_reward": 1.0, + "step": 1540, + "think_completion_length": 43.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.59375, + "epoch": 2.602023608768971, + "grad_norm": 10.794107348698743, + "kl": 0.470703125, + "learning_rate": 4.802698145025295e-07, + "loss": 0.0005, + "reward": 3.417840003967285, + "reward_std": 0.06528156064450741, + "rewards/final_reward": 1.9286261272195278, + "rewards/mask_iou_reward": 0.9643130636097639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4178398847579956, + "rewards/thk_ans_format_reward": 1.0, + "step": 1541, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.15625, + "epoch": 2.603709949409781, + "grad_norm": 7.865836908975941, + "kl": 0.564453125, + "learning_rate": 4.799325463743676e-07, + "loss": 0.0007, + "reward": 3.137349486351013, + "reward_std": 0.04046285804361105, + "rewards/final_reward": 0.8131963840540475, + "rewards/mask_iou_reward": 0.40659819202702374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1373494267463684, + "rewards/thk_ans_format_reward": 1.0, + "step": 1542, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.640625, + "epoch": 2.6053962900505905, + "grad_norm": 5.886858445716339, + "kl": 0.61328125, + "learning_rate": 4.795952782462057e-07, + "loss": 0.0006, + "reward": 3.5436571836471558, + "reward_std": 0.3002474457025528, + "rewards/final_reward": 1.548718318359077, + "rewards/mask_iou_reward": 0.7743591591795385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5436573028564453, + "rewards/thk_ans_format_reward": 1.0, + "step": 1543, + "think_completion_length": 44.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.84375, + "epoch": 2.6070826306913997, + "grad_norm": 10.253609083824529, + "kl": 0.53125, + "learning_rate": 4.792580101180438e-07, + "loss": 0.0005, + "reward": 3.101767063140869, + "reward_std": 0.3000176250934601, + "rewards/final_reward": 1.0033951506957848, + "rewards/mask_iou_reward": 0.5016975753478924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1017670631408691, + "rewards/thk_ans_format_reward": 1.0, + "step": 1544, + "think_completion_length": 46.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.75, + "epoch": 2.608768971332209, + "grad_norm": 9.753023875173408, + "kl": 0.55859375, + "learning_rate": 4.789207419898819e-07, + "loss": 0.0006, + "reward": 3.387078046798706, + "reward_std": 0.01357703935354948, + "rewards/final_reward": 1.0556593821705356, + "rewards/mask_iou_reward": 0.5278296910852678, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3870778679847717, + "rewards/thk_ans_format_reward": 1.0, + "step": 1545, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.140625, + "epoch": 2.6104553119730185, + "grad_norm": 75.76666417873618, + "kl": 0.478515625, + "learning_rate": 4.7858347386172e-07, + "loss": 0.0004, + "reward": 3.6486486196517944, + "reward_std": 0.2635802363511175, + "rewards/final_reward": 1.7562827520153421, + "rewards/mask_iou_reward": 0.8781413760076711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6486486196517944, + "rewards/thk_ans_format_reward": 1.0, + "step": 1546, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.53125, + "epoch": 2.612141652613828, + "grad_norm": 5.692902016421105, + "kl": 0.474609375, + "learning_rate": 4.782462057335582e-07, + "loss": 0.0005, + "reward": 3.7076621055603027, + "reward_std": 0.046445537358522415, + "rewards/final_reward": 1.5944065581111615, + "rewards/mask_iou_reward": 0.7972032790555807, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7076621055603027, + "rewards/thk_ans_format_reward": 1.0, + "step": 1547, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.15625, + "epoch": 2.6138279932546373, + "grad_norm": 4.7331556470796725, + "kl": 0.53515625, + "learning_rate": 4.779089376053963e-07, + "loss": 0.0005, + "reward": 2.840194821357727, + "reward_std": 0.17273178696632385, + "rewards/final_reward": 0.8641808244057804, + "rewards/mask_iou_reward": 0.4320904122028902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.840194795280695, + "rewards/thk_ans_format_reward": 1.0, + "step": 1548, + "think_completion_length": 46.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5625, + "epoch": 2.615514333895447, + "grad_norm": 6.113780727882999, + "kl": 0.544921875, + "learning_rate": 4.775716694772344e-07, + "loss": 0.0006, + "reward": 3.314778447151184, + "reward_std": 0.10576976649463177, + "rewards/final_reward": 1.2631982577774228, + "rewards/mask_iou_reward": 0.6315991288887114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3147783279418945, + "rewards/thk_ans_format_reward": 1.0, + "step": 1549, + "think_completion_length": 49.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.59375, + "epoch": 2.6172006745362566, + "grad_norm": 14.729443816776524, + "kl": 0.51171875, + "learning_rate": 4.772344013490725e-07, + "loss": 0.0005, + "reward": 3.392350196838379, + "reward_std": 0.02547481842339039, + "rewards/final_reward": 1.917889459453028, + "rewards/mask_iou_reward": 0.958944729726514, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3923503756523132, + "rewards/thk_ans_format_reward": 1.0, + "step": 1550, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.453125, + "epoch": 2.618887015177066, + "grad_norm": 23.575749523820498, + "kl": 0.4873046875, + "learning_rate": 4.768971332209106e-07, + "loss": 0.0005, + "reward": 3.108386278152466, + "reward_std": 0.16571337264031172, + "rewards/final_reward": 0.7428194446241151, + "rewards/mask_iou_reward": 0.37140972231205754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1083862483501434, + "rewards/thk_ans_format_reward": 1.0, + "step": 1551, + "think_completion_length": 45.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.59375, + "epoch": 2.620573355817875, + "grad_norm": 25.729924296933714, + "kl": 0.4970703125, + "learning_rate": 4.765598650927487e-07, + "loss": 0.0005, + "reward": 3.8241816759109497, + "reward_std": 0.2826864686794579, + "rewards/final_reward": 1.802022132728133, + "rewards/mask_iou_reward": 0.9010110663640665, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.8398066759109497, + "rewards/thk_ans_format_reward": 1.0, + "step": 1552, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.515625, + "epoch": 2.6222596964586846, + "grad_norm": 11.691282009828335, + "kl": 0.4853515625, + "learning_rate": 4.7622259696458683e-07, + "loss": 0.0005, + "reward": 3.5969111919403076, + "reward_std": 0.09428023174405098, + "rewards/final_reward": 1.7918991579440622, + "rewards/mask_iou_reward": 0.8959495789720311, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5969111919403076, + "rewards/thk_ans_format_reward": 1.0, + "step": 1553, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.234375, + "epoch": 2.6239460370994943, + "grad_norm": 6.5807467921403, + "kl": 0.533203125, + "learning_rate": 4.7588532883642497e-07, + "loss": 0.0005, + "reward": 3.04490065574646, + "reward_std": 0.21727034822106361, + "rewards/final_reward": 1.1348485780762427, + "rewards/mask_iou_reward": 0.5674242890381214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0449006259441376, + "rewards/thk_ans_format_reward": 1.0, + "step": 1554, + "think_completion_length": 48.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.71875, + "epoch": 2.6256323777403034, + "grad_norm": 16.10521079538983, + "kl": 0.623046875, + "learning_rate": 4.75548060708263e-07, + "loss": 0.0006, + "reward": 3.658421516418457, + "reward_std": 0.16007404774427414, + "rewards/final_reward": 1.7699895470810052, + "rewards/mask_iou_reward": 0.8849947735405026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6584214568138123, + "rewards/thk_ans_format_reward": 1.0, + "step": 1555, + "think_completion_length": 42.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.78125, + "epoch": 2.627318718381113, + "grad_norm": 42.81121603125669, + "kl": 0.548828125, + "learning_rate": 4.7521079258010115e-07, + "loss": 0.0006, + "reward": 3.4277206659317017, + "reward_std": 0.03460780787281692, + "rewards/final_reward": 1.1004128676198721, + "rewards/mask_iou_reward": 0.5502064338099361, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4277206063270569, + "rewards/thk_ans_format_reward": 1.0, + "step": 1556, + "think_completion_length": 41.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.96875, + "epoch": 2.6290050590219223, + "grad_norm": 11.764793983624923, + "kl": 0.54296875, + "learning_rate": 4.748735244519393e-07, + "loss": 0.0005, + "reward": 3.1046032905578613, + "reward_std": 0.1467623095959425, + "rewards/final_reward": 1.4726183202278702, + "rewards/mask_iou_reward": 0.7363091601139351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.104603350162506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1557, + "think_completion_length": 37.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.03125, + "epoch": 2.630691399662732, + "grad_norm": 7.374056984850983, + "kl": 0.521484375, + "learning_rate": 4.745362563237774e-07, + "loss": 0.0005, + "reward": 3.222964286804199, + "reward_std": 0.09663717821240425, + "rewards/final_reward": 1.8764069665929193, + "rewards/mask_iou_reward": 0.9382034832964596, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.222964346408844, + "rewards/thk_ans_format_reward": 1.0, + "step": 1558, + "think_completion_length": 42.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.90625, + "epoch": 2.632377740303541, + "grad_norm": 8.212332172856842, + "kl": 0.47265625, + "learning_rate": 4.741989881956155e-07, + "loss": 0.0005, + "reward": 3.1506274938583374, + "reward_std": 0.13635646551847458, + "rewards/final_reward": 0.6150793550844958, + "rewards/mask_iou_reward": 0.3075396775422479, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1506274938583374, + "rewards/thk_ans_format_reward": 1.0, + "step": 1559, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.71875, + "epoch": 2.6340640809443507, + "grad_norm": 10.737499573640928, + "kl": 0.501953125, + "learning_rate": 4.738617200674536e-07, + "loss": 0.0005, + "reward": 3.4245011806488037, + "reward_std": 0.24318117648363113, + "rewards/final_reward": 1.8178093915501898, + "rewards/mask_iou_reward": 0.9089046957750949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.424501121044159, + "rewards/thk_ans_format_reward": 1.0, + "step": 1560, + "think_completion_length": 50.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5, + "epoch": 2.6357504215851604, + "grad_norm": 9.811740705111555, + "kl": 0.53125, + "learning_rate": 4.735244519392917e-07, + "loss": 0.0005, + "reward": 3.083121180534363, + "reward_std": 0.17737470380961895, + "rewards/final_reward": 0.7628667234987518, + "rewards/mask_iou_reward": 0.3814333617493759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0831211805343628, + "rewards/thk_ans_format_reward": 1.0, + "step": 1561, + "think_completion_length": 45.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.046875, + "epoch": 2.6374367622259696, + "grad_norm": 17.93304553748424, + "kl": 0.55078125, + "learning_rate": 4.7318718381112983e-07, + "loss": 0.0006, + "reward": 3.072922706604004, + "reward_std": 0.24106465280056, + "rewards/final_reward": 1.264943024137027, + "rewards/mask_iou_reward": 0.6324715120685135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0729226768016815, + "rewards/thk_ans_format_reward": 1.0, + "step": 1562, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.15625, + "epoch": 2.639123102866779, + "grad_norm": 16.148344624678465, + "kl": 0.46875, + "learning_rate": 4.7284991568296797e-07, + "loss": 0.0005, + "reward": 3.426049590110779, + "reward_std": 0.09079774469137192, + "rewards/final_reward": 1.5492462119283235, + "rewards/mask_iou_reward": 0.7746231059641617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.426049828529358, + "rewards/thk_ans_format_reward": 1.0, + "step": 1563, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.984375, + "epoch": 2.6408094435075884, + "grad_norm": 10.99108255679532, + "kl": 0.4794921875, + "learning_rate": 4.7251264755480606e-07, + "loss": 0.0005, + "reward": 3.2649797201156616, + "reward_std": 0.2973283752799034, + "rewards/final_reward": 0.6567315942814922, + "rewards/mask_iou_reward": 0.3283657971407461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2649796605110168, + "rewards/thk_ans_format_reward": 1.0, + "step": 1564, + "think_completion_length": 46.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 2.642495784148398, + "grad_norm": 9.148717957849211, + "kl": 0.537109375, + "learning_rate": 4.7217537942664415e-07, + "loss": 0.0005, + "reward": 3.450225353240967, + "reward_std": 0.32389168441295624, + "rewards/final_reward": 1.584107655751557, + "rewards/mask_iou_reward": 0.7920538278757785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4502254724502563, + "rewards/thk_ans_format_reward": 1.0, + "step": 1565, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.140625, + "epoch": 2.6441821247892072, + "grad_norm": 10.15309035210223, + "kl": 0.466796875, + "learning_rate": 4.718381112984823e-07, + "loss": 0.0005, + "reward": 3.3438801765441895, + "reward_std": 0.15512818098068237, + "rewards/final_reward": 1.4235350129170727, + "rewards/mask_iou_reward": 0.7117675064585364, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3438800573349, + "rewards/thk_ans_format_reward": 1.0, + "step": 1566, + "think_completion_length": 43.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.640625, + "epoch": 2.645868465430017, + "grad_norm": 34.60246296479256, + "kl": 0.4931640625, + "learning_rate": 4.715008431703204e-07, + "loss": 0.0005, + "reward": 2.9925897121429443, + "reward_std": 0.13927190750837326, + "rewards/final_reward": 1.2736035743438077, + "rewards/mask_iou_reward": 0.6368017871719038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.992589682340622, + "rewards/thk_ans_format_reward": 1.0, + "step": 1567, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 2.6475548060708265, + "grad_norm": 13.797768570000349, + "kl": 0.59375, + "learning_rate": 4.7116357504215846e-07, + "loss": 0.0006, + "reward": 2.680661916732788, + "reward_std": 0.19025126099586487, + "rewards/final_reward": 0.9316987866921573, + "rewards/mask_iou_reward": 0.46584939334607867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6806618869304657, + "rewards/thk_ans_format_reward": 1.0, + "step": 1568, + "think_completion_length": 39.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.890625, + "epoch": 2.6492411467116357, + "grad_norm": 6.704404939031757, + "kl": 0.5263671875, + "learning_rate": 4.708263069139966e-07, + "loss": 0.0005, + "reward": 3.408167600631714, + "reward_std": 0.08674982748925686, + "rewards/final_reward": 1.8698519267907314, + "rewards/mask_iou_reward": 0.9349259633953657, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4081675112247467, + "rewards/thk_ans_format_reward": 1.0, + "step": 1569, + "think_completion_length": 40.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.375, + "epoch": 2.6509274873524453, + "grad_norm": 67.84706890476959, + "kl": 0.529296875, + "learning_rate": 4.7048903878583474e-07, + "loss": 0.0005, + "reward": 3.11447811126709, + "reward_std": 0.1034508217126131, + "rewards/final_reward": 1.031368616121662, + "rewards/mask_iou_reward": 0.515684308060831, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1144780814647675, + "rewards/thk_ans_format_reward": 1.0, + "step": 1570, + "think_completion_length": 42.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.5, + "epoch": 2.6526138279932545, + "grad_norm": 11.334062103906817, + "kl": 0.482421875, + "learning_rate": 4.7015177065767283e-07, + "loss": 0.0005, + "reward": 3.4597952365875244, + "reward_std": 0.190800953656435, + "rewards/final_reward": 1.5933513720971386, + "rewards/mask_iou_reward": 0.7966756860485693, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4597952961921692, + "rewards/thk_ans_format_reward": 1.0, + "step": 1571, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.40625, + "epoch": 2.654300168634064, + "grad_norm": 9.567327556714062, + "kl": 0.52734375, + "learning_rate": 4.698145025295109e-07, + "loss": 0.0005, + "reward": 3.682429313659668, + "reward_std": 0.03618870349600911, + "rewards/final_reward": 1.642017994065745, + "rewards/mask_iou_reward": 0.8210089970328724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6824292540550232, + "rewards/thk_ans_format_reward": 1.0, + "step": 1572, + "think_completion_length": 44.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.296875, + "epoch": 2.6559865092748733, + "grad_norm": 6.993710088151398, + "kl": 0.546875, + "learning_rate": 4.6947723440134906e-07, + "loss": 0.0006, + "reward": 3.35558819770813, + "reward_std": 0.07169377896934748, + "rewards/final_reward": 1.0683089924040468, + "rewards/mask_iou_reward": 0.5341544962020234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.355588138103485, + "rewards/thk_ans_format_reward": 1.0, + "step": 1573, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 2.657672849915683, + "grad_norm": 10.336596094703028, + "kl": 0.5546875, + "learning_rate": 4.6913996627318714e-07, + "loss": 0.0006, + "reward": 2.707545042037964, + "reward_std": 0.15012376755475998, + "rewards/final_reward": 1.2545923639349246, + "rewards/mask_iou_reward": 0.6272961819674623, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7075448632240295, + "rewards/thk_ans_format_reward": 1.0, + "step": 1574, + "think_completion_length": 46.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.59375, + "epoch": 2.6593591905564926, + "grad_norm": 7.323434814783891, + "kl": 0.58984375, + "learning_rate": 4.688026981450253e-07, + "loss": 0.0006, + "reward": 3.3268707990646362, + "reward_std": 0.1611822471022606, + "rewards/final_reward": 1.0528655309199106, + "rewards/mask_iou_reward": 0.5264327654599553, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3268707990646362, + "rewards/thk_ans_format_reward": 1.0, + "step": 1575, + "think_completion_length": 40.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.703125, + "epoch": 2.661045531197302, + "grad_norm": 6.051668551380872, + "kl": 0.537109375, + "learning_rate": 4.6846543001686337e-07, + "loss": 0.0005, + "reward": 3.360478639602661, + "reward_std": 0.021059296559542418, + "rewards/final_reward": 0.9446825785933939, + "rewards/mask_iou_reward": 0.47234128929669694, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3604785799980164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1576, + "think_completion_length": 45.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.625, + "epoch": 2.6627318718381114, + "grad_norm": 9.767147695822144, + "kl": 0.52734375, + "learning_rate": 4.681281618887015e-07, + "loss": 0.0005, + "reward": 3.1824584007263184, + "reward_std": 0.02653807308524847, + "rewards/final_reward": 0.8726776630599029, + "rewards/mask_iou_reward": 0.43633883152995145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1824583113193512, + "rewards/thk_ans_format_reward": 1.0, + "step": 1577, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.734375, + "epoch": 2.6644182124789206, + "grad_norm": 6.890106456107886, + "kl": 0.6171875, + "learning_rate": 4.677908937605396e-07, + "loss": 0.0006, + "reward": 3.0945212841033936, + "reward_std": 0.07227480411529541, + "rewards/final_reward": 0.6768194410453567, + "rewards/mask_iou_reward": 0.33840972052267837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0945212841033936, + "rewards/thk_ans_format_reward": 1.0, + "step": 1578, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.421875, + "epoch": 2.6661045531197303, + "grad_norm": 11.178278259713263, + "kl": 0.54296875, + "learning_rate": 4.6745362563237774e-07, + "loss": 0.0006, + "reward": 2.717486262321472, + "reward_std": 0.08585362508893013, + "rewards/final_reward": 1.0023938367623468, + "rewards/mask_iou_reward": 0.5011969183811734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7174861431121826, + "rewards/thk_ans_format_reward": 1.0, + "step": 1579, + "think_completion_length": 36.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.09375, + "epoch": 2.6677908937605395, + "grad_norm": 7.912862704188623, + "kl": 0.4765625, + "learning_rate": 4.6711635750421583e-07, + "loss": 0.0005, + "reward": 3.138826847076416, + "reward_std": 0.030409451574087143, + "rewards/final_reward": 1.1384120289812303, + "rewards/mask_iou_reward": 0.5692060144906151, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1388267874717712, + "rewards/thk_ans_format_reward": 1.0, + "step": 1580, + "think_completion_length": 37.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.796875, + "epoch": 2.669477234401349, + "grad_norm": 17.902177939828203, + "kl": 0.541015625, + "learning_rate": 4.667790893760539e-07, + "loss": 0.0005, + "reward": 2.9601058959960938, + "reward_std": 0.08017378486692905, + "rewards/final_reward": 0.8861616697236607, + "rewards/mask_iou_reward": 0.44308083486183036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9601059556007385, + "rewards/thk_ans_format_reward": 1.0, + "step": 1581, + "think_completion_length": 41.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 2.6711635750421587, + "grad_norm": 5.77744769275141, + "kl": 0.53125, + "learning_rate": 4.6644182124789205e-07, + "loss": 0.0006, + "reward": 3.140070676803589, + "reward_std": 0.02819753671064973, + "rewards/final_reward": 1.1835882399632247, + "rewards/mask_iou_reward": 0.5917941199816124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1400706768035889, + "rewards/thk_ans_format_reward": 1.0, + "step": 1582, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.65625, + "epoch": 2.672849915682968, + "grad_norm": 9.166588439312482, + "kl": 0.5078125, + "learning_rate": 4.661045531197302e-07, + "loss": 0.0005, + "reward": 3.6381969451904297, + "reward_std": 0.0838024877011776, + "rewards/final_reward": 1.8922678160683175, + "rewards/mask_iou_reward": 0.9461339080341588, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6381970047950745, + "rewards/thk_ans_format_reward": 1.0, + "step": 1583, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.140625, + "epoch": 2.6745362563237776, + "grad_norm": 17.661711314451015, + "kl": 0.51953125, + "learning_rate": 4.6576728499156823e-07, + "loss": 0.0005, + "reward": 3.1295218467712402, + "reward_std": 0.04637359641492367, + "rewards/final_reward": 1.2038188705486585, + "rewards/mask_iou_reward": 0.6019094352743293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1295219659805298, + "rewards/thk_ans_format_reward": 1.0, + "step": 1584, + "think_completion_length": 46.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.46875, + "epoch": 2.6762225969645868, + "grad_norm": 6.201859830852071, + "kl": 0.5673828125, + "learning_rate": 4.6543001686340637e-07, + "loss": 0.0006, + "reward": 3.565206527709961, + "reward_std": 0.22164902091026306, + "rewards/final_reward": 1.2804556381149486, + "rewards/mask_iou_reward": 0.6402278190574743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.565206527709961, + "rewards/thk_ans_format_reward": 1.0, + "step": 1585, + "think_completion_length": 44.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.15625, + "epoch": 2.6779089376053964, + "grad_norm": 12.381435115250927, + "kl": 0.54296875, + "learning_rate": 4.650927487352445e-07, + "loss": 0.0005, + "reward": 3.632646679878235, + "reward_std": 0.09525941498577595, + "rewards/final_reward": 1.9017620673478721, + "rewards/mask_iou_reward": 0.9508810336739361, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6326467394828796, + "rewards/thk_ans_format_reward": 1.0, + "step": 1586, + "think_completion_length": 44.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.984375, + "epoch": 2.6795952782462056, + "grad_norm": 7.556322139294845, + "kl": 0.69921875, + "learning_rate": 4.647554806070826e-07, + "loss": 0.0007, + "reward": 2.895167589187622, + "reward_std": 0.04426950961351395, + "rewards/final_reward": 1.3905354281615103, + "rewards/mask_iou_reward": 0.6952677140807552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.895167738199234, + "rewards/thk_ans_format_reward": 1.0, + "step": 1587, + "think_completion_length": 41.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.453125, + "epoch": 2.681281618887015, + "grad_norm": 5.349816511244455, + "kl": 0.5625, + "learning_rate": 4.6441821247892074e-07, + "loss": 0.0006, + "reward": 3.0216450691223145, + "reward_std": 0.18535812944173813, + "rewards/final_reward": 1.1056129172841695, + "rewards/mask_iou_reward": 0.5528064586420848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0216450691223145, + "rewards/thk_ans_format_reward": 1.0, + "step": 1588, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.265625, + "epoch": 2.682967959527825, + "grad_norm": 8.617515731302568, + "kl": 0.5234375, + "learning_rate": 4.640809443507588e-07, + "loss": 0.0005, + "reward": 2.9087109565734863, + "reward_std": 0.029049073811620474, + "rewards/final_reward": 0.6989310824756831, + "rewards/mask_iou_reward": 0.34946554123784157, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9087110161781311, + "rewards/thk_ans_format_reward": 1.0, + "step": 1589, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.53125, + "epoch": 2.684654300168634, + "grad_norm": 6.251207462853013, + "kl": 0.54296875, + "learning_rate": 4.6374367622259697e-07, + "loss": 0.0005, + "reward": 3.040684938430786, + "reward_std": 0.2383800894021988, + "rewards/final_reward": 1.0378911364396055, + "rewards/mask_iou_reward": 0.5189455682198028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0406849384307861, + "rewards/thk_ans_format_reward": 1.0, + "step": 1590, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.609375, + "epoch": 2.6863406408094432, + "grad_norm": 14.302390776245351, + "kl": 0.51171875, + "learning_rate": 4.6340640809443505e-07, + "loss": 0.0005, + "reward": 3.3956209421157837, + "reward_std": 0.08094323147088289, + "rewards/final_reward": 1.1523773228814034, + "rewards/mask_iou_reward": 0.5761886614407017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3956209421157837, + "rewards/thk_ans_format_reward": 1.0, + "step": 1591, + "think_completion_length": 38.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.59375, + "epoch": 2.688026981450253, + "grad_norm": 10.327695067737794, + "kl": 0.51953125, + "learning_rate": 4.630691399662732e-07, + "loss": 0.0005, + "reward": 3.3911901712417603, + "reward_std": 0.015237356536090374, + "rewards/final_reward": 1.8107498385321739, + "rewards/mask_iou_reward": 0.9053749192660869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.391190081834793, + "rewards/thk_ans_format_reward": 1.0, + "step": 1592, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.640625, + "epoch": 2.6897133220910625, + "grad_norm": 4.515188568828064, + "kl": 0.564453125, + "learning_rate": 4.627318718381113e-07, + "loss": 0.0005, + "reward": 3.6611695289611816, + "reward_std": 0.010300178895704448, + "rewards/final_reward": 1.5551834657420902, + "rewards/mask_iou_reward": 0.7775917328710451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6611695885658264, + "rewards/thk_ans_format_reward": 1.0, + "step": 1593, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 2.6913996627318717, + "grad_norm": 9.967779718750911, + "kl": 0.5546875, + "learning_rate": 4.6239460370994937e-07, + "loss": 0.0005, + "reward": 3.6303428411483765, + "reward_std": 0.13057681638747454, + "rewards/final_reward": 1.3867047517088262, + "rewards/mask_iou_reward": 0.6933523758544131, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6303430199623108, + "rewards/thk_ans_format_reward": 1.0, + "step": 1594, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.328125, + "epoch": 2.6930860033726813, + "grad_norm": 7.092355396036124, + "kl": 0.5126953125, + "learning_rate": 4.620573355817875e-07, + "loss": 0.0005, + "reward": 3.195076107978821, + "reward_std": 0.20519106090068817, + "rewards/final_reward": 0.6965408540739686, + "rewards/mask_iou_reward": 0.3482704270369843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1950761079788208, + "rewards/thk_ans_format_reward": 1.0, + "step": 1595, + "think_completion_length": 39.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.40625, + "epoch": 2.694772344013491, + "grad_norm": 5.681443841176162, + "kl": 0.666015625, + "learning_rate": 4.6172006745362565e-07, + "loss": 0.0007, + "reward": 3.520709276199341, + "reward_std": 0.10630087740719318, + "rewards/final_reward": 1.224463961360859, + "rewards/mask_iou_reward": 0.6122319806804295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5207092761993408, + "rewards/thk_ans_format_reward": 1.0, + "step": 1596, + "think_completion_length": 43.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 2.6964586846543, + "grad_norm": 26.572470268498073, + "kl": 0.591796875, + "learning_rate": 4.613827993254637e-07, + "loss": 0.0006, + "reward": 3.334014058113098, + "reward_std": 0.11482397792860866, + "rewards/final_reward": 1.4160978681431455, + "rewards/mask_iou_reward": 0.7080489340715728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.334014117717743, + "rewards/thk_ans_format_reward": 1.0, + "step": 1597, + "think_completion_length": 48.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.609375, + "epoch": 2.6981450252951094, + "grad_norm": 20.336568028235305, + "kl": 0.556640625, + "learning_rate": 4.610455311973018e-07, + "loss": 0.0006, + "reward": 3.5459563732147217, + "reward_std": 0.05063344561494887, + "rewards/final_reward": 1.6638033763740356, + "rewards/mask_iou_reward": 0.8319016881870178, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.545956313610077, + "rewards/thk_ans_format_reward": 1.0, + "step": 1598, + "think_completion_length": 46.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.640625, + "epoch": 2.699831365935919, + "grad_norm": 6.445917565428539, + "kl": 0.626953125, + "learning_rate": 4.6070826306913996e-07, + "loss": 0.0006, + "reward": 3.5455654859542847, + "reward_std": 0.15113668888807297, + "rewards/final_reward": 1.9026466968239584, + "rewards/mask_iou_reward": 0.9513233484119792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.545565664768219, + "rewards/thk_ans_format_reward": 1.0, + "step": 1599, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.125, + "epoch": 2.7015177065767286, + "grad_norm": 9.59769819423389, + "kl": 0.494140625, + "learning_rate": 4.6037099494097805e-07, + "loss": 0.0005, + "reward": 3.7451967000961304, + "reward_std": 0.15041500329971313, + "rewards/final_reward": 1.5559414480541731, + "rewards/mask_iou_reward": 0.7779707240270866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.74519681930542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1600, + "think_completion_length": 48.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.28125, + "epoch": 2.703204047217538, + "grad_norm": 46.21311513255855, + "kl": 0.552734375, + "learning_rate": 4.6003372681281614e-07, + "loss": 0.0005, + "reward": 2.6555440425872803, + "reward_std": 0.13194021955132484, + "rewards/final_reward": 1.1682005930727084, + "rewards/mask_iou_reward": 0.5841002965363542, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.655544102191925, + "rewards/thk_ans_format_reward": 1.0, + "step": 1601, + "think_completion_length": 44.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.625, + "epoch": 2.7048903878583475, + "grad_norm": 11.008089396246493, + "kl": 0.5478515625, + "learning_rate": 4.596964586846543e-07, + "loss": 0.0005, + "reward": 3.157100558280945, + "reward_std": 0.1843406707048416, + "rewards/final_reward": 1.3733449605594634, + "rewards/mask_iou_reward": 0.6866724802797317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1571004986763, + "rewards/thk_ans_format_reward": 1.0, + "step": 1602, + "think_completion_length": 42.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 2.706576728499157, + "grad_norm": 6.757045349435166, + "kl": 0.58203125, + "learning_rate": 4.593591905564924e-07, + "loss": 0.0006, + "reward": 3.094280958175659, + "reward_std": 0.06427431292831898, + "rewards/final_reward": 0.8938852405476458, + "rewards/mask_iou_reward": 0.4469426202738229, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0942809730768204, + "rewards/thk_ans_format_reward": 1.0, + "step": 1603, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.296875, + "epoch": 2.7082630691399663, + "grad_norm": 6.581458185650574, + "kl": 0.560546875, + "learning_rate": 4.590219224283305e-07, + "loss": 0.0006, + "reward": 3.599763035774231, + "reward_std": 0.09732984844595194, + "rewards/final_reward": 1.4477912477637431, + "rewards/mask_iou_reward": 0.7238956238818716, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5997629761695862, + "rewards/thk_ans_format_reward": 1.0, + "step": 1604, + "think_completion_length": 38.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.890625, + "epoch": 2.7099494097807755, + "grad_norm": 8.227960755990702, + "kl": 0.48828125, + "learning_rate": 4.586846543001686e-07, + "loss": 0.0005, + "reward": 3.294048309326172, + "reward_std": 0.19548258185386658, + "rewards/final_reward": 1.0749916780010045, + "rewards/mask_iou_reward": 0.5374958390005022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.294048249721527, + "rewards/thk_ans_format_reward": 1.0, + "step": 1605, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.65625, + "epoch": 2.711635750421585, + "grad_norm": 13.268237610277964, + "kl": 0.4794921875, + "learning_rate": 4.5834738617200673e-07, + "loss": 0.0005, + "reward": 3.5636035203933716, + "reward_std": 0.07341088191606104, + "rewards/final_reward": 1.6283072965909686, + "rewards/mask_iou_reward": 0.8141536482954843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.563603401184082, + "rewards/thk_ans_format_reward": 1.0, + "step": 1606, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.171875, + "epoch": 2.7133220910623947, + "grad_norm": 11.23746727458345, + "kl": 0.509765625, + "learning_rate": 4.580101180438448e-07, + "loss": 0.0005, + "reward": 2.89646053314209, + "reward_std": 0.2256901040673256, + "rewards/final_reward": 1.0191730065477667, + "rewards/mask_iou_reward": 0.5095865032738833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8964604437351227, + "rewards/thk_ans_format_reward": 1.0, + "step": 1607, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.734375, + "epoch": 2.715008431703204, + "grad_norm": 11.332571852572475, + "kl": 0.671875, + "learning_rate": 4.5767284991568296e-07, + "loss": 0.0007, + "reward": 2.9931583404541016, + "reward_std": 0.16069792211055756, + "rewards/final_reward": 0.6567371198659422, + "rewards/mask_iou_reward": 0.3283685599329711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9931585192680359, + "rewards/thk_ans_format_reward": 1.0, + "step": 1608, + "think_completion_length": 46.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.78125, + "epoch": 2.7166947723440136, + "grad_norm": 20.350092315088784, + "kl": 0.6484375, + "learning_rate": 4.5733558178752105e-07, + "loss": 0.0006, + "reward": 3.3594272136688232, + "reward_std": 0.049404000863432884, + "rewards/final_reward": 1.0703260453871735, + "rewards/mask_iou_reward": 0.5351630226935867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3594273328781128, + "rewards/thk_ans_format_reward": 1.0, + "step": 1609, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.359375, + "epoch": 2.718381112984823, + "grad_norm": 11.614624529345251, + "kl": 0.541015625, + "learning_rate": 4.5699831365935914e-07, + "loss": 0.0005, + "reward": 3.3027396202087402, + "reward_std": 0.12427800334990025, + "rewards/final_reward": 1.7234073373638734, + "rewards/mask_iou_reward": 0.8617036686819367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3027397394180298, + "rewards/thk_ans_format_reward": 1.0, + "step": 1610, + "think_completion_length": 45.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.546875, + "epoch": 2.7200674536256324, + "grad_norm": 6.353380378963483, + "kl": 0.505859375, + "learning_rate": 4.566610455311973e-07, + "loss": 0.0005, + "reward": 3.120621681213379, + "reward_std": 0.2408496029675007, + "rewards/final_reward": 1.1250674417261752, + "rewards/mask_iou_reward": 0.5625337208630876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.12062169611454, + "rewards/thk_ans_format_reward": 1.0, + "step": 1611, + "think_completion_length": 51.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.203125, + "epoch": 2.7217537942664416, + "grad_norm": 4.3232466439197115, + "kl": 0.5234375, + "learning_rate": 4.563237774030354e-07, + "loss": 0.0005, + "reward": 2.8183436393737793, + "reward_std": 0.019991028821095824, + "rewards/final_reward": 0.6046746524383444, + "rewards/mask_iou_reward": 0.3023373262191722, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8183438181877136, + "rewards/thk_ans_format_reward": 1.0, + "step": 1612, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.046875, + "epoch": 2.7234401349072512, + "grad_norm": 6.560569368733555, + "kl": 0.466796875, + "learning_rate": 4.5598650927487345e-07, + "loss": 0.0005, + "reward": 3.509775757789612, + "reward_std": 0.020152635872364044, + "rewards/final_reward": 1.8637907902287016, + "rewards/mask_iou_reward": 0.9318953951143508, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5097758769989014, + "rewards/thk_ans_format_reward": 1.0, + "step": 1613, + "think_completion_length": 37.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.9375, + "epoch": 2.725126475548061, + "grad_norm": 29.48749989127377, + "kl": 0.4638671875, + "learning_rate": 4.556492411467116e-07, + "loss": 0.0005, + "reward": 3.187331199645996, + "reward_std": 0.3615667298436165, + "rewards/final_reward": 0.7844147741617697, + "rewards/mask_iou_reward": 0.3922073870808849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1873311400413513, + "rewards/thk_ans_format_reward": 1.0, + "step": 1614, + "think_completion_length": 54.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.359375, + "epoch": 2.72681281618887, + "grad_norm": 7.455100685242957, + "kl": 0.595703125, + "learning_rate": 4.5531197301854973e-07, + "loss": 0.0006, + "reward": 3.0750458240509033, + "reward_std": 0.1224330198019743, + "rewards/final_reward": 0.8567996114915193, + "rewards/mask_iou_reward": 0.42839980574575964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0750460028648376, + "rewards/thk_ans_format_reward": 1.0, + "step": 1615, + "think_completion_length": 39.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.140625, + "epoch": 2.7284991568296797, + "grad_norm": 617.4205143388256, + "kl": 1.263671875, + "learning_rate": 4.5497470489038787e-07, + "loss": 0.0013, + "reward": 3.344548225402832, + "reward_std": 0.13941496424376965, + "rewards/final_reward": 1.4527047944685108, + "rewards/mask_iou_reward": 0.7263523972342554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3445482850074768, + "rewards/thk_ans_format_reward": 1.0, + "step": 1616, + "think_completion_length": 39.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.28125, + "epoch": 2.730185497470489, + "grad_norm": 5.885694035409707, + "kl": 0.53125, + "learning_rate": 4.5463743676222596e-07, + "loss": 0.0005, + "reward": 3.196670651435852, + "reward_std": 0.0907645896077156, + "rewards/final_reward": 1.4354340157265555, + "rewards/mask_iou_reward": 0.7177170078632777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1966705918312073, + "rewards/thk_ans_format_reward": 1.0, + "step": 1617, + "think_completion_length": 47.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 2.7318718381112985, + "grad_norm": 11.887463663477943, + "kl": 1.025390625, + "learning_rate": 4.5430016863406405e-07, + "loss": 0.001, + "reward": 3.2308313846588135, + "reward_std": 0.17940062656998634, + "rewards/final_reward": 1.219334363125589, + "rewards/mask_iou_reward": 0.6096671815627945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.230831354856491, + "rewards/thk_ans_format_reward": 1.0, + "step": 1618, + "think_completion_length": 48.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.15625, + "epoch": 2.7335581787521077, + "grad_norm": 5.53371551554527, + "kl": 0.58203125, + "learning_rate": 4.539629005059022e-07, + "loss": 0.0006, + "reward": 3.69223952293396, + "reward_std": 0.06898763962090015, + "rewards/final_reward": 1.6759169513149303, + "rewards/mask_iou_reward": 0.8379584756574652, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.69223952293396, + "rewards/thk_ans_format_reward": 1.0, + "step": 1619, + "think_completion_length": 43.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 2.7352445193929174, + "grad_norm": 18.416952301844045, + "kl": 0.517578125, + "learning_rate": 4.536256323777403e-07, + "loss": 0.0005, + "reward": 3.3278738260269165, + "reward_std": 0.1171044334769249, + "rewards/final_reward": 0.9361255570934764, + "rewards/mask_iou_reward": 0.4680627785467382, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3278738260269165, + "rewards/thk_ans_format_reward": 1.0, + "step": 1620, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.1875, + "epoch": 2.736930860033727, + "grad_norm": 10.515013413730191, + "kl": 0.5546875, + "learning_rate": 4.532883642495784e-07, + "loss": 0.0006, + "reward": 3.530287265777588, + "reward_std": 0.21374469250440598, + "rewards/final_reward": 1.4084952224038, + "rewards/mask_iou_reward": 0.7042476112019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5302872061729431, + "rewards/thk_ans_format_reward": 1.0, + "step": 1621, + "think_completion_length": 45.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.359375, + "epoch": 2.738617200674536, + "grad_norm": 10.898962457623472, + "kl": 0.537109375, + "learning_rate": 4.529510961214165e-07, + "loss": 0.0005, + "reward": 3.6671559810638428, + "reward_std": 0.06463497970253229, + "rewards/final_reward": 1.7504511102408151, + "rewards/mask_iou_reward": 0.8752255551204076, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6671560406684875, + "rewards/thk_ans_format_reward": 1.0, + "step": 1622, + "think_completion_length": 40.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.34375, + "epoch": 2.740303541315346, + "grad_norm": 8.048926304189164, + "kl": 0.5205078125, + "learning_rate": 4.526138279932546e-07, + "loss": 0.0005, + "reward": 3.449007511138916, + "reward_std": 0.1426835972815752, + "rewards/final_reward": 1.6894874620838656, + "rewards/mask_iou_reward": 0.8447437310419328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.449007511138916, + "rewards/thk_ans_format_reward": 1.0, + "step": 1623, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.046875, + "epoch": 2.741989881956155, + "grad_norm": 6.472851906577908, + "kl": 0.564453125, + "learning_rate": 4.5227655986509273e-07, + "loss": 0.0006, + "reward": 3.6451568603515625, + "reward_std": 0.2593112513422966, + "rewards/final_reward": 1.4224126006269602, + "rewards/mask_iou_reward": 0.7112063003134801, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.660781741142273, + "rewards/thk_ans_format_reward": 1.0, + "step": 1624, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.140625, + "epoch": 2.7436762225969646, + "grad_norm": 10.19352056567603, + "kl": 0.51953125, + "learning_rate": 4.5193929173693087e-07, + "loss": 0.0005, + "reward": 3.6311769485473633, + "reward_std": 0.016571541782468557, + "rewards/final_reward": 1.8216684584209872, + "rewards/mask_iou_reward": 0.9108342292104936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6311771273612976, + "rewards/thk_ans_format_reward": 1.0, + "step": 1625, + "think_completion_length": 43.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.890625, + "epoch": 2.745362563237774, + "grad_norm": 7.9582142441158545, + "kl": 0.5234375, + "learning_rate": 4.5160202360876896e-07, + "loss": 0.0005, + "reward": 3.2092288732528687, + "reward_std": 0.1798749640583992, + "rewards/final_reward": 1.476282555361136, + "rewards/mask_iou_reward": 0.738141277680568, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2092288732528687, + "rewards/thk_ans_format_reward": 1.0, + "step": 1626, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.203125, + "epoch": 2.7470489038785835, + "grad_norm": 10.413865750913033, + "kl": 0.564453125, + "learning_rate": 4.5126475548060705e-07, + "loss": 0.0006, + "reward": 3.249502658843994, + "reward_std": 0.116750568151474, + "rewards/final_reward": 1.4793633630563305, + "rewards/mask_iou_reward": 0.7396816815281653, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2495025396347046, + "rewards/thk_ans_format_reward": 1.0, + "step": 1627, + "think_completion_length": 41.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.78125, + "epoch": 2.748735244519393, + "grad_norm": 14.680895975342848, + "kl": 0.509765625, + "learning_rate": 4.509274873524452e-07, + "loss": 0.0005, + "reward": 3.1441562175750732, + "reward_std": 0.0737368743866682, + "rewards/final_reward": 1.3852608388558878, + "rewards/mask_iou_reward": 0.6926304194279439, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1441562175750732, + "rewards/thk_ans_format_reward": 1.0, + "step": 1628, + "think_completion_length": 38.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.296875, + "epoch": 2.7504215851602023, + "grad_norm": 9.618197850292306, + "kl": 0.625, + "learning_rate": 4.5059021922428333e-07, + "loss": 0.0006, + "reward": 3.2445462942123413, + "reward_std": 0.2222440093755722, + "rewards/final_reward": 1.501265198198892, + "rewards/mask_iou_reward": 0.750632599099446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2445462942123413, + "rewards/thk_ans_format_reward": 1.0, + "step": 1629, + "think_completion_length": 36.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.71875, + "epoch": 2.752107925801012, + "grad_norm": 8.92522300504253, + "kl": 0.56640625, + "learning_rate": 4.5025295109612136e-07, + "loss": 0.0006, + "reward": 3.412382483482361, + "reward_std": 0.32697246968746185, + "rewards/final_reward": 1.2967872210805327, + "rewards/mask_iou_reward": 0.6483936105402663, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4280074834823608, + "rewards/thk_ans_format_reward": 1.0, + "step": 1630, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.578125, + "epoch": 2.753794266441821, + "grad_norm": 9.728732780329596, + "kl": 0.541015625, + "learning_rate": 4.499156829679595e-07, + "loss": 0.0004, + "reward": 3.712798237800598, + "reward_std": 0.14024843752849847, + "rewards/final_reward": 1.5205530420294848, + "rewards/mask_iou_reward": 0.7602765210147424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7127981185913086, + "rewards/thk_ans_format_reward": 1.0, + "step": 1631, + "think_completion_length": 45.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.484375, + "epoch": 2.7554806070826308, + "grad_norm": 55.711069059492566, + "kl": 0.548828125, + "learning_rate": 4.4957841483979764e-07, + "loss": 0.0005, + "reward": 3.454300045967102, + "reward_std": 0.052289645187556744, + "rewards/final_reward": 1.872316458665721, + "rewards/mask_iou_reward": 0.9361582293328605, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4542999267578125, + "rewards/thk_ans_format_reward": 1.0, + "step": 1632, + "think_completion_length": 45.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.703125, + "epoch": 2.75716694772344, + "grad_norm": 12.41641458988822, + "kl": 0.572265625, + "learning_rate": 4.4924114671163573e-07, + "loss": 0.0006, + "reward": 3.2182466983795166, + "reward_std": 0.08807303197681904, + "rewards/final_reward": 0.8138171199638627, + "rewards/mask_iou_reward": 0.4069085599819314, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2182466089725494, + "rewards/thk_ans_format_reward": 1.0, + "step": 1633, + "think_completion_length": 39.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.21875, + "epoch": 2.7588532883642496, + "grad_norm": 5.889717675375059, + "kl": 0.552734375, + "learning_rate": 4.489038785834738e-07, + "loss": 0.0006, + "reward": 2.974811315536499, + "reward_std": 0.06643060594797134, + "rewards/final_reward": 1.0631971480457183, + "rewards/mask_iou_reward": 0.5315985740228591, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9748111963272095, + "rewards/thk_ans_format_reward": 1.0, + "step": 1634, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.203125, + "epoch": 2.7605396290050592, + "grad_norm": 14.3730869784043, + "kl": 0.513671875, + "learning_rate": 4.4856661045531196e-07, + "loss": 0.0005, + "reward": 3.477365493774414, + "reward_std": 0.3136833906173706, + "rewards/final_reward": 1.4492890341658577, + "rewards/mask_iou_reward": 0.7246445170829289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.477365493774414, + "rewards/thk_ans_format_reward": 1.0, + "step": 1635, + "think_completion_length": 48.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0, + "epoch": 2.7622259696458684, + "grad_norm": 9.270564936349368, + "kl": 0.564453125, + "learning_rate": 4.4822934232715004e-07, + "loss": 0.0006, + "reward": 2.9075610637664795, + "reward_std": 0.13323557563126087, + "rewards/final_reward": 1.5394845428329393, + "rewards/mask_iou_reward": 0.7697422714164697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9075611233711243, + "rewards/thk_ans_format_reward": 1.0, + "step": 1636, + "think_completion_length": 48.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.859375, + "epoch": 2.763912310286678, + "grad_norm": 9.59060155091405, + "kl": 0.55859375, + "learning_rate": 4.478920741989882e-07, + "loss": 0.0006, + "reward": 3.169069766998291, + "reward_std": 0.24465776793658733, + "rewards/final_reward": 1.1316411755875047, + "rewards/mask_iou_reward": 0.5658205877937523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.169069766998291, + "rewards/thk_ans_format_reward": 1.0, + "step": 1637, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.453125, + "epoch": 2.7655986509274872, + "grad_norm": 7.833469504528331, + "kl": 0.486328125, + "learning_rate": 4.4755480607082627e-07, + "loss": 0.0005, + "reward": 3.5080454349517822, + "reward_std": 0.044747334672138095, + "rewards/final_reward": 1.8708290131033436, + "rewards/mask_iou_reward": 0.9354145065516718, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5080453753471375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1638, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.21875, + "epoch": 2.767284991568297, + "grad_norm": 20.33564956478746, + "kl": 0.7431640625, + "learning_rate": 4.472175379426644e-07, + "loss": 0.0007, + "reward": 3.184965491294861, + "reward_std": 0.13730136305093765, + "rewards/final_reward": 1.3746444918404204, + "rewards/mask_iou_reward": 0.6873222459202102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.184965431690216, + "rewards/thk_ans_format_reward": 1.0, + "step": 1639, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.46875, + "epoch": 2.768971332209106, + "grad_norm": 9.139679927371628, + "kl": 0.537109375, + "learning_rate": 4.468802698145025e-07, + "loss": 0.0005, + "reward": 3.26485538482666, + "reward_std": 0.2971716374158859, + "rewards/final_reward": 0.9064043195540362, + "rewards/mask_iou_reward": 0.4532021597770181, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2804805040359497, + "rewards/thk_ans_format_reward": 1.0, + "step": 1640, + "think_completion_length": 43.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 2.7706576728499157, + "grad_norm": 28.423688155899523, + "kl": 0.58203125, + "learning_rate": 4.4654300168634064e-07, + "loss": 0.0006, + "reward": 3.2505204677581787, + "reward_std": 0.08370211534202099, + "rewards/final_reward": 0.9374799320219221, + "rewards/mask_iou_reward": 0.4687399660109611, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2505203485488892, + "rewards/thk_ans_format_reward": 1.0, + "step": 1641, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.09375, + "epoch": 2.7723440134907253, + "grad_norm": 48.70271629740253, + "kl": 0.568359375, + "learning_rate": 4.462057335581788e-07, + "loss": 0.0006, + "reward": 3.169440746307373, + "reward_std": 0.0842177951708436, + "rewards/final_reward": 1.3710371271332886, + "rewards/mask_iou_reward": 0.6855185635666443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1694406569004059, + "rewards/thk_ans_format_reward": 1.0, + "step": 1642, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.734375, + "epoch": 2.7740303541315345, + "grad_norm": 7.366441989581299, + "kl": 0.5625, + "learning_rate": 4.458684654300168e-07, + "loss": 0.0006, + "reward": 3.6222939491271973, + "reward_std": 0.03713347762823105, + "rewards/final_reward": 1.5904692873556985, + "rewards/mask_iou_reward": 0.7952346436778492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6222938299179077, + "rewards/thk_ans_format_reward": 1.0, + "step": 1643, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 2.775716694772344, + "grad_norm": 9.447110316239264, + "kl": 0.626953125, + "learning_rate": 4.4553119730185496e-07, + "loss": 0.0006, + "reward": 3.4532470703125, + "reward_std": 0.20233439654111862, + "rewards/final_reward": 1.3015430654140039, + "rewards/mask_iou_reward": 0.6507715327070019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4532470703125, + "rewards/thk_ans_format_reward": 1.0, + "step": 1644, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.703125, + "epoch": 2.7774030354131534, + "grad_norm": 13.954279040230144, + "kl": 0.4765625, + "learning_rate": 4.451939291736931e-07, + "loss": 0.0005, + "reward": 3.377991557121277, + "reward_std": 0.18850401416420937, + "rewards/final_reward": 1.3938890227534597, + "rewards/mask_iou_reward": 0.6969445113767299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3779913783073425, + "rewards/thk_ans_format_reward": 1.0, + "step": 1645, + "think_completion_length": 44.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.015625, + "epoch": 2.779089376053963, + "grad_norm": 22.690826871154588, + "kl": 0.548828125, + "learning_rate": 4.4485666104553113e-07, + "loss": 0.0005, + "reward": 3.6439974308013916, + "reward_std": 0.18691938370466232, + "rewards/final_reward": 1.3127684972547642, + "rewards/mask_iou_reward": 0.6563842486273821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6439975500106812, + "rewards/thk_ans_format_reward": 1.0, + "step": 1646, + "think_completion_length": 43.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.4375, + "epoch": 2.780775716694772, + "grad_norm": 5.949871279661459, + "kl": 0.5390625, + "learning_rate": 4.4451939291736927e-07, + "loss": 0.0005, + "reward": 3.5799875259399414, + "reward_std": 0.06551541201770306, + "rewards/final_reward": 1.5637152935744738, + "rewards/mask_iou_reward": 0.7818576467872369, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.579987347126007, + "rewards/thk_ans_format_reward": 1.0, + "step": 1647, + "think_completion_length": 48.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.0625, + "epoch": 2.782462057335582, + "grad_norm": 7.5002904571187665, + "kl": 0.60546875, + "learning_rate": 4.441821247892074e-07, + "loss": 0.0006, + "reward": 3.5384901762008667, + "reward_std": 0.10225693881511688, + "rewards/final_reward": 1.243734327595926, + "rewards/mask_iou_reward": 0.621867163797963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.538490116596222, + "rewards/thk_ans_format_reward": 1.0, + "step": 1648, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.515625, + "epoch": 2.7841483979763915, + "grad_norm": 11.423476860723076, + "kl": 0.505859375, + "learning_rate": 4.438448566610455e-07, + "loss": 0.0005, + "reward": 3.378657102584839, + "reward_std": 0.011581235099583864, + "rewards/final_reward": 1.1859685848692345, + "rewards/mask_iou_reward": 0.5929842924346173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.378657042980194, + "rewards/thk_ans_format_reward": 1.0, + "step": 1649, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.265625, + "epoch": 2.7858347386172007, + "grad_norm": 7.507930542476445, + "kl": 0.525390625, + "learning_rate": 4.4350758853288364e-07, + "loss": 0.0009, + "reward": 3.132522940635681, + "reward_std": 0.04552896483801305, + "rewards/final_reward": 0.7150072786373457, + "rewards/mask_iou_reward": 0.35750363931867285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1325227916240692, + "rewards/thk_ans_format_reward": 1.0, + "step": 1650, + "think_completion_length": 52.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.90625, + "epoch": 2.78752107925801, + "grad_norm": 8.212728271467169, + "kl": 0.546875, + "learning_rate": 4.431703204047217e-07, + "loss": 0.0005, + "reward": 3.3425687551498413, + "reward_std": 0.1770862564444542, + "rewards/final_reward": 1.2308162768632087, + "rewards/mask_iou_reward": 0.6154081384316044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3425687551498413, + "rewards/thk_ans_format_reward": 1.0, + "step": 1651, + "think_completion_length": 43.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.265625, + "epoch": 2.7892074198988195, + "grad_norm": 10.726955125370036, + "kl": 0.46875, + "learning_rate": 4.4283305227655987e-07, + "loss": 0.0005, + "reward": 3.3672693967819214, + "reward_std": 0.09276960045099258, + "rewards/final_reward": 1.5693248195881653, + "rewards/mask_iou_reward": 0.7846624097940826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3672692775726318, + "rewards/thk_ans_format_reward": 1.0, + "step": 1652, + "think_completion_length": 52.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.78125, + "epoch": 2.790893760539629, + "grad_norm": 20.126983932706253, + "kl": 0.5302734375, + "learning_rate": 4.4249578414839795e-07, + "loss": 0.0005, + "reward": 2.6683385372161865, + "reward_std": 0.08216170221567154, + "rewards/final_reward": 0.280023642755239, + "rewards/mask_iou_reward": 0.1400118213776195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6683385670185089, + "rewards/thk_ans_format_reward": 1.0, + "step": 1653, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.390625, + "epoch": 2.7925801011804383, + "grad_norm": 9.48556505248788, + "kl": 0.46484375, + "learning_rate": 4.421585160202361e-07, + "loss": 0.0005, + "reward": 3.599041700363159, + "reward_std": 0.22133435308933258, + "rewards/final_reward": 1.5422429531382462, + "rewards/mask_iou_reward": 0.7711214765691231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5990417003631592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1654, + "think_completion_length": 48.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.84375, + "epoch": 2.794266441821248, + "grad_norm": 8.033105157926789, + "kl": 0.546875, + "learning_rate": 4.418212478920742e-07, + "loss": 0.0005, + "reward": 3.0550079345703125, + "reward_std": 0.19160734862089157, + "rewards/final_reward": 0.37046362232971186, + "rewards/mask_iou_reward": 0.18523181116485593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0550077557563782, + "rewards/thk_ans_format_reward": 1.0, + "step": 1655, + "think_completion_length": 53.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.90625, + "epoch": 2.7959527824620576, + "grad_norm": 5.009021405486533, + "kl": 0.548828125, + "learning_rate": 4.4148397976391227e-07, + "loss": 0.0006, + "reward": 3.5861432552337646, + "reward_std": 0.08710538037121296, + "rewards/final_reward": 1.5525587463652795, + "rewards/mask_iou_reward": 0.7762793731826397, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5861433148384094, + "rewards/thk_ans_format_reward": 1.0, + "step": 1656, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.96875, + "epoch": 2.7976391231028668, + "grad_norm": 8.854949474945357, + "kl": 0.5, + "learning_rate": 4.411467116357504e-07, + "loss": 0.0005, + "reward": 3.54121994972229, + "reward_std": 0.09857543557882309, + "rewards/final_reward": 1.4392045951501446, + "rewards/mask_iou_reward": 0.7196022975750723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5412198901176453, + "rewards/thk_ans_format_reward": 1.0, + "step": 1657, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.0625, + "epoch": 2.799325463743676, + "grad_norm": 12.518744855651851, + "kl": 0.5625, + "learning_rate": 4.4080944350758855e-07, + "loss": 0.0006, + "reward": 3.2827905416488647, + "reward_std": 0.27441447228193283, + "rewards/final_reward": 1.0464232900290225, + "rewards/mask_iou_reward": 0.5232116450145112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2827905416488647, + "rewards/thk_ans_format_reward": 1.0, + "step": 1658, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.53125, + "epoch": 2.8010118043844856, + "grad_norm": 7.398113473881379, + "kl": 0.568359375, + "learning_rate": 4.404721753794266e-07, + "loss": 0.0006, + "reward": 3.1118181943893433, + "reward_std": 0.09089295193552971, + "rewards/final_reward": 1.056219472565322, + "rewards/mask_iou_reward": 0.528109736282661, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1118182241916656, + "rewards/thk_ans_format_reward": 1.0, + "step": 1659, + "think_completion_length": 51.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.953125, + "epoch": 2.8026981450252952, + "grad_norm": 4.592629735441871, + "kl": 0.515625, + "learning_rate": 4.401349072512647e-07, + "loss": 0.0005, + "reward": 3.5405365228652954, + "reward_std": 0.058989531360566616, + "rewards/final_reward": 1.421891593709226, + "rewards/mask_iou_reward": 0.710945796854613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5405365824699402, + "rewards/thk_ans_format_reward": 1.0, + "step": 1660, + "think_completion_length": 47.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.921875, + "epoch": 2.8043844856661044, + "grad_norm": 8.301478140894433, + "kl": 0.56640625, + "learning_rate": 4.3979763912310286e-07, + "loss": 0.0006, + "reward": 3.2338669300079346, + "reward_std": 0.20402198284864426, + "rewards/final_reward": 1.4756446384310582, + "rewards/mask_iou_reward": 0.7378223192155291, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2338669896125793, + "rewards/thk_ans_format_reward": 1.0, + "step": 1661, + "think_completion_length": 50.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0, + "epoch": 2.806070826306914, + "grad_norm": 15.009090092095851, + "kl": 0.517578125, + "learning_rate": 4.3946037099494095e-07, + "loss": 0.0005, + "reward": 3.4566744565963745, + "reward_std": 0.1600627675652504, + "rewards/final_reward": 1.6646250918542926, + "rewards/mask_iou_reward": 0.8323125459271463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4566745162010193, + "rewards/thk_ans_format_reward": 1.0, + "step": 1662, + "think_completion_length": 48.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 2.8077571669477237, + "grad_norm": 7.411943859075892, + "kl": 0.537109375, + "learning_rate": 4.3912310286677904e-07, + "loss": 0.0005, + "reward": 3.2928357124328613, + "reward_std": 0.04496446065604687, + "rewards/final_reward": 1.0907731326858414, + "rewards/mask_iou_reward": 0.5453865663429207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.292835682630539, + "rewards/thk_ans_format_reward": 1.0, + "step": 1663, + "think_completion_length": 45.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0, + "epoch": 2.809443507588533, + "grad_norm": 5.522931760133354, + "kl": 0.693359375, + "learning_rate": 4.387858347386172e-07, + "loss": 0.0007, + "reward": 2.4156452417373657, + "reward_std": 0.27752088755369186, + "rewards/final_reward": 0.5370063073601792, + "rewards/mask_iou_reward": 0.2685031536800896, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.41564518958330154, + "rewards/thk_ans_format_reward": 1.0, + "step": 1664, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.96875, + "epoch": 2.811129848229342, + "grad_norm": 12.48393400789474, + "kl": 0.8046875, + "learning_rate": 4.384485666104553e-07, + "loss": 0.0008, + "reward": 3.167549252510071, + "reward_std": 0.1540435515344143, + "rewards/final_reward": 1.2255837292741503, + "rewards/mask_iou_reward": 0.6127918646370751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1675493121147156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1665, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.484375, + "epoch": 2.8128161888701517, + "grad_norm": 7.842676773141203, + "kl": 0.54296875, + "learning_rate": 4.381112984822934e-07, + "loss": 0.0003, + "reward": 3.243940830230713, + "reward_std": 0.0764221902936697, + "rewards/final_reward": 1.1349538626751443, + "rewards/mask_iou_reward": 0.5674769313375722, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.243940830230713, + "rewards/thk_ans_format_reward": 1.0, + "step": 1666, + "think_completion_length": 52.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.265625, + "epoch": 2.8145025295109614, + "grad_norm": 7.420756090615668, + "kl": 0.802734375, + "learning_rate": 4.377740303541315e-07, + "loss": 0.0008, + "reward": 3.0553218126296997, + "reward_std": 0.1333809532225132, + "rewards/final_reward": 1.877014424743797, + "rewards/mask_iou_reward": 0.9385072123718985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0553217232227325, + "rewards/thk_ans_format_reward": 1.0, + "step": 1667, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.921875, + "epoch": 2.8161888701517706, + "grad_norm": 20.9164669985359, + "kl": 0.57421875, + "learning_rate": 4.3743676222596963e-07, + "loss": 0.0006, + "reward": 3.2621958255767822, + "reward_std": 0.10390813648700714, + "rewards/final_reward": 1.5714543200341808, + "rewards/mask_iou_reward": 0.7857271600170904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.262195885181427, + "rewards/thk_ans_format_reward": 1.0, + "step": 1668, + "think_completion_length": 54.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.46875, + "epoch": 2.81787521079258, + "grad_norm": 11.46581781710695, + "kl": 0.5087890625, + "learning_rate": 4.370994940978077e-07, + "loss": 0.0005, + "reward": 3.0988789796829224, + "reward_std": 0.16297003626823425, + "rewards/final_reward": 0.3341480251757681, + "rewards/mask_iou_reward": 0.16707401258788404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0988790392875671, + "rewards/thk_ans_format_reward": 1.0, + "step": 1669, + "think_completion_length": 49.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.640625, + "epoch": 2.8195615514333894, + "grad_norm": 12.987290901235163, + "kl": 0.533203125, + "learning_rate": 4.3676222596964586e-07, + "loss": 0.0006, + "reward": 2.8739062547683716, + "reward_std": 0.10603522881865501, + "rewards/final_reward": 0.9522916842171223, + "rewards/mask_iou_reward": 0.47614584210856115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8739061681553721, + "rewards/thk_ans_format_reward": 1.0, + "step": 1670, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.015625, + "epoch": 2.821247892074199, + "grad_norm": 6.2920481506061385, + "kl": 0.48046875, + "learning_rate": 4.3642495784148395e-07, + "loss": 0.0005, + "reward": 3.123382806777954, + "reward_std": 0.05944320000708103, + "rewards/final_reward": 1.2645310449635334, + "rewards/mask_iou_reward": 0.6322655224817667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1233826875686646, + "rewards/thk_ans_format_reward": 1.0, + "step": 1671, + "think_completion_length": 52.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.15625, + "epoch": 2.822934232715008, + "grad_norm": 4.193921693846766, + "kl": 0.609375, + "learning_rate": 4.3608768971332204e-07, + "loss": 0.0006, + "reward": 3.4360289573669434, + "reward_std": 0.1628253385424614, + "rewards/final_reward": 1.405196945674128, + "rewards/mask_iou_reward": 0.702598472837064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4360288381576538, + "rewards/thk_ans_format_reward": 1.0, + "step": 1672, + "think_completion_length": 49.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3125, + "epoch": 2.824620573355818, + "grad_norm": 13.421029881471101, + "kl": 0.67578125, + "learning_rate": 4.357504215851602e-07, + "loss": 0.0007, + "reward": 3.248274564743042, + "reward_std": 0.026005716295912862, + "rewards/final_reward": 1.0280415642752836, + "rewards/mask_iou_reward": 0.5140207821376418, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2482746243476868, + "rewards/thk_ans_format_reward": 1.0, + "step": 1673, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.109375, + "epoch": 2.8263069139966275, + "grad_norm": 13.271196275319467, + "kl": 0.484375, + "learning_rate": 4.354131534569983e-07, + "loss": 0.0005, + "reward": 2.8315566778182983, + "reward_std": 0.30540551617741585, + "rewards/final_reward": 0.9391927953720351, + "rewards/mask_iou_reward": 0.4695963976860176, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.894056499004364, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1674, + "think_completion_length": 47.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.90625, + "epoch": 2.8279932546374367, + "grad_norm": 8.455936572709941, + "kl": 0.552734375, + "learning_rate": 4.3507588532883635e-07, + "loss": 0.0006, + "reward": 3.7103850841522217, + "reward_std": 0.12102552060969174, + "rewards/final_reward": 1.8495349146417959, + "rewards/mask_iou_reward": 0.9247674573208979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.710385262966156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1675, + "think_completion_length": 49.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.390625, + "epoch": 2.8296795952782463, + "grad_norm": 23.85193166056514, + "kl": 0.4775390625, + "learning_rate": 4.347386172006745e-07, + "loss": 0.0005, + "reward": 3.2892091274261475, + "reward_std": 0.2604014202952385, + "rewards/final_reward": 1.5074901698516783, + "rewards/mask_iou_reward": 0.7537450849258391, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2892090678215027, + "rewards/thk_ans_format_reward": 1.0, + "step": 1676, + "think_completion_length": 47.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 2.8313659359190555, + "grad_norm": 5.168884073555499, + "kl": 0.56640625, + "learning_rate": 4.3440134907251263e-07, + "loss": 0.0006, + "reward": 3.3190125226974487, + "reward_std": 0.20287639647722244, + "rewards/final_reward": 1.5806630389857197, + "rewards/mask_iou_reward": 0.7903315194928598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3190125823020935, + "rewards/thk_ans_format_reward": 1.0, + "step": 1677, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 2.833052276559865, + "grad_norm": 92.17630352695905, + "kl": 0.576171875, + "learning_rate": 4.3406408094435077e-07, + "loss": 0.0006, + "reward": 3.4132070541381836, + "reward_std": 0.11548706330358982, + "rewards/final_reward": 1.3225807783636303, + "rewards/mask_iou_reward": 0.6612903891818152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4132071137428284, + "rewards/thk_ans_format_reward": 1.0, + "step": 1678, + "think_completion_length": 51.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.0625, + "epoch": 2.8347386172006743, + "grad_norm": 9.76328541975843, + "kl": 0.537109375, + "learning_rate": 4.3372681281618886e-07, + "loss": 0.0005, + "reward": 2.71964168548584, + "reward_std": 0.3583778738975525, + "rewards/final_reward": 0.7354924743235481, + "rewards/mask_iou_reward": 0.36774623716177407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7196417450904846, + "rewards/thk_ans_format_reward": 1.0, + "step": 1679, + "think_completion_length": 48.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.71875, + "epoch": 2.836424957841484, + "grad_norm": 18.884921118766233, + "kl": 0.546875, + "learning_rate": 4.3338954468802695e-07, + "loss": 0.0005, + "reward": 3.5863040685653687, + "reward_std": 0.16526619624346495, + "rewards/final_reward": 1.5597678148528689, + "rewards/mask_iou_reward": 0.7798839074264344, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5863041877746582, + "rewards/thk_ans_format_reward": 1.0, + "step": 1680, + "think_completion_length": 49.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.421875, + "epoch": 2.8381112984822936, + "grad_norm": 5.336722180196324, + "kl": 0.466796875, + "learning_rate": 4.330522765598651e-07, + "loss": 0.0005, + "reward": 3.214847207069397, + "reward_std": 0.12801437883172184, + "rewards/final_reward": 1.580831129811842, + "rewards/mask_iou_reward": 0.790415564905921, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.214847207069397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1681, + "think_completion_length": 52.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.328125, + "epoch": 2.839797639123103, + "grad_norm": 5.561161774358996, + "kl": 0.474609375, + "learning_rate": 4.327150084317032e-07, + "loss": 0.0005, + "reward": 3.457669496536255, + "reward_std": 0.13381371274590492, + "rewards/final_reward": 1.054481489107785, + "rewards/mask_iou_reward": 0.5272407445538925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4576694965362549, + "rewards/thk_ans_format_reward": 1.0, + "step": 1682, + "think_completion_length": 51.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.75, + "epoch": 2.8414839797639124, + "grad_norm": 8.289504941062013, + "kl": 0.525390625, + "learning_rate": 4.323777403035413e-07, + "loss": 0.0005, + "reward": 3.3733558654785156, + "reward_std": 0.2661462351679802, + "rewards/final_reward": 1.5664154016541394, + "rewards/mask_iou_reward": 0.7832077008270697, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.40460604429245, + "rewards/thk_ans_format_reward": 1.0, + "step": 1683, + "think_completion_length": 49.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.140625, + "epoch": 2.8431703204047216, + "grad_norm": 11.426186675709356, + "kl": 0.482421875, + "learning_rate": 4.320404721753794e-07, + "loss": 0.0005, + "reward": 3.723689556121826, + "reward_std": 0.08870341628789902, + "rewards/final_reward": 1.8753199432935024, + "rewards/mask_iou_reward": 0.9376599716467512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7236894965171814, + "rewards/thk_ans_format_reward": 1.0, + "step": 1684, + "think_completion_length": 52.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.125, + "epoch": 2.8448566610455313, + "grad_norm": 12.582730794114397, + "kl": 0.5380859375, + "learning_rate": 4.317032040472175e-07, + "loss": 0.0005, + "reward": 2.6829700469970703, + "reward_std": 0.24802841991186142, + "rewards/final_reward": 0.32948334050492945, + "rewards/mask_iou_reward": 0.16474167025246472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6829699873924255, + "rewards/thk_ans_format_reward": 1.0, + "step": 1685, + "think_completion_length": 47.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.078125, + "epoch": 2.8465430016863404, + "grad_norm": 12.427712787402632, + "kl": 0.654296875, + "learning_rate": 4.3136593591905563e-07, + "loss": 0.0007, + "reward": 3.4638320207595825, + "reward_std": 0.2700415402650833, + "rewards/final_reward": 1.4534286260477245, + "rewards/mask_iou_reward": 0.7267143130238622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4638320207595825, + "rewards/thk_ans_format_reward": 1.0, + "step": 1686, + "think_completion_length": 55.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.15625, + "epoch": 2.84822934232715, + "grad_norm": 11.38645102336239, + "kl": 0.541015625, + "learning_rate": 4.3102866779089377e-07, + "loss": 0.0005, + "reward": 3.382949709892273, + "reward_std": 0.3745395615696907, + "rewards/final_reward": 1.6297009222230234, + "rewards/mask_iou_reward": 0.8148504611115117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.382949709892273, + "rewards/thk_ans_format_reward": 1.0, + "step": 1687, + "think_completion_length": 54.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.234375, + "epoch": 2.8499156829679597, + "grad_norm": 16.14129074118289, + "kl": 0.537109375, + "learning_rate": 4.306913996627318e-07, + "loss": 0.0005, + "reward": 3.3319497108459473, + "reward_std": 0.0831929137930274, + "rewards/final_reward": 1.1821064456849155, + "rewards/mask_iou_reward": 0.5910532228424578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3319497108459473, + "rewards/thk_ans_format_reward": 1.0, + "step": 1688, + "think_completion_length": 48.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 2.851602023608769, + "grad_norm": 103.81113268417846, + "kl": 0.57421875, + "learning_rate": 4.3035413153456995e-07, + "loss": 0.0006, + "reward": 3.1938982009887695, + "reward_std": 0.09463486075401306, + "rewards/final_reward": 0.8637187013740543, + "rewards/mask_iou_reward": 0.43185935068702713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.19389808177948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1689, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.734375, + "epoch": 2.8532883642495785, + "grad_norm": 18.93569565214868, + "kl": 0.5244140625, + "learning_rate": 4.300168634064081e-07, + "loss": 0.0005, + "reward": 2.992497682571411, + "reward_std": 0.07934637367725372, + "rewards/final_reward": 0.7805767257267343, + "rewards/mask_iou_reward": 0.39028836286336716, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9924976080656052, + "rewards/thk_ans_format_reward": 1.0, + "step": 1690, + "think_completion_length": 51.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.84375, + "epoch": 2.8549747048903877, + "grad_norm": 6.227094648425705, + "kl": 0.5849609375, + "learning_rate": 4.2967959527824623e-07, + "loss": 0.0006, + "reward": 3.5347208976745605, + "reward_std": 0.02190372860059142, + "rewards/final_reward": 1.175894394798423, + "rewards/mask_iou_reward": 0.5879471973992115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.534720778465271, + "rewards/thk_ans_format_reward": 1.0, + "step": 1691, + "think_completion_length": 46.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.796875, + "epoch": 2.8566610455311974, + "grad_norm": 15.977781215682915, + "kl": 0.57421875, + "learning_rate": 4.2934232715008426e-07, + "loss": 0.0006, + "reward": 3.471309781074524, + "reward_std": 0.12639077939093113, + "rewards/final_reward": 1.8368525076066613, + "rewards/mask_iou_reward": 0.9184262538033306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.471309781074524, + "rewards/thk_ans_format_reward": 1.0, + "step": 1692, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.421875, + "epoch": 2.8583473861720066, + "grad_norm": 17.817364501759883, + "kl": 1.21875, + "learning_rate": 4.290050590219224e-07, + "loss": 0.0012, + "reward": 3.349412679672241, + "reward_std": 0.02023978717625141, + "rewards/final_reward": 1.6064038112818841, + "rewards/mask_iou_reward": 0.8032019056409421, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3494127988815308, + "rewards/thk_ans_format_reward": 1.0, + "step": 1693, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.828125, + "epoch": 2.860033726812816, + "grad_norm": 11.180486783020493, + "kl": 0.546875, + "learning_rate": 4.2866779089376054e-07, + "loss": 0.0005, + "reward": 3.344439148902893, + "reward_std": 0.05913896486163139, + "rewards/final_reward": 1.2388363326863248, + "rewards/mask_iou_reward": 0.6194181663431624, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3444392085075378, + "rewards/thk_ans_format_reward": 1.0, + "step": 1694, + "think_completion_length": 46.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.796875, + "epoch": 2.861720067453626, + "grad_norm": 7.891941279533489, + "kl": 0.5078125, + "learning_rate": 4.2833052276559863e-07, + "loss": 0.0005, + "reward": 3.255765199661255, + "reward_std": 0.1523425281047821, + "rewards/final_reward": 1.1536836657640526, + "rewards/mask_iou_reward": 0.5768418328820263, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2557652592658997, + "rewards/thk_ans_format_reward": 1.0, + "step": 1695, + "think_completion_length": 50.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.34375, + "epoch": 2.863406408094435, + "grad_norm": 4.954447968205195, + "kl": 0.697265625, + "learning_rate": 4.279932546374367e-07, + "loss": 0.0007, + "reward": 3.497539758682251, + "reward_std": 0.04229480121284723, + "rewards/final_reward": 1.5405165870513375, + "rewards/mask_iou_reward": 0.7702582935256688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4975398182868958, + "rewards/thk_ans_format_reward": 1.0, + "step": 1696, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.359375, + "epoch": 2.8650927487352447, + "grad_norm": 10.248081433569746, + "kl": 0.529296875, + "learning_rate": 4.2765598650927486e-07, + "loss": 0.0005, + "reward": 3.390744209289551, + "reward_std": 0.09942889865487814, + "rewards/final_reward": 0.9318678485841465, + "rewards/mask_iou_reward": 0.46593392429207325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3907442688941956, + "rewards/thk_ans_format_reward": 1.0, + "step": 1697, + "think_completion_length": 51.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.671875, + "epoch": 2.866779089376054, + "grad_norm": 19.360336118338246, + "kl": 0.5625, + "learning_rate": 4.2731871838111294e-07, + "loss": 0.0006, + "reward": 3.254174590110779, + "reward_std": 0.34707289934158325, + "rewards/final_reward": 0.8415739413701474, + "rewards/mask_iou_reward": 0.4207869706850737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2541745901107788, + "rewards/thk_ans_format_reward": 1.0, + "step": 1698, + "think_completion_length": 53.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.890625, + "epoch": 2.8684654300168635, + "grad_norm": 17.160014335485958, + "kl": 1.837890625, + "learning_rate": 4.269814502529511e-07, + "loss": 0.0018, + "reward": 3.398214817047119, + "reward_std": 0.12190900649875402, + "rewards/final_reward": 1.4416676597779143, + "rewards/mask_iou_reward": 0.7208338298889572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3982148170471191, + "rewards/thk_ans_format_reward": 1.0, + "step": 1699, + "think_completion_length": 46.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.765625, + "epoch": 2.8701517706576727, + "grad_norm": 5.664639260401403, + "kl": 0.490234375, + "learning_rate": 4.2664418212478917e-07, + "loss": 0.0005, + "reward": 3.334146022796631, + "reward_std": 0.04577235411852598, + "rewards/final_reward": 1.062847271016929, + "rewards/mask_iou_reward": 0.5314236355084645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3341458439826965, + "rewards/thk_ans_format_reward": 1.0, + "step": 1700, + "think_completion_length": 48.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.515625, + "epoch": 2.8718381112984823, + "grad_norm": 6.177565453121126, + "kl": 0.4443359375, + "learning_rate": 4.263069139966273e-07, + "loss": 0.0004, + "reward": 3.4637255668640137, + "reward_std": 0.09016487468034029, + "rewards/final_reward": 1.1509884523194014, + "rewards/mask_iou_reward": 0.5754942261597007, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4637253284454346, + "rewards/thk_ans_format_reward": 1.0, + "step": 1701, + "think_completion_length": 43.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.796875, + "epoch": 2.873524451939292, + "grad_norm": 7.6573470902159775, + "kl": 0.4873046875, + "learning_rate": 4.259696458684654e-07, + "loss": 0.0005, + "reward": 3.3190892934799194, + "reward_std": 0.1383841149508953, + "rewards/final_reward": 0.7692882700656908, + "rewards/mask_iou_reward": 0.3846441350328454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3190893530845642, + "rewards/thk_ans_format_reward": 1.0, + "step": 1702, + "think_completion_length": 55.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.96875, + "epoch": 2.875210792580101, + "grad_norm": 16.530129336864004, + "kl": 0.55859375, + "learning_rate": 4.2563237774030354e-07, + "loss": 0.0006, + "reward": 3.119685411453247, + "reward_std": 0.1856657639145851, + "rewards/final_reward": 0.9465999923214892, + "rewards/mask_iou_reward": 0.4732999961607446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1196853816509247, + "rewards/thk_ans_format_reward": 1.0, + "step": 1703, + "think_completion_length": 48.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.609375, + "epoch": 2.876897133220911, + "grad_norm": 5.611754103632963, + "kl": 0.587890625, + "learning_rate": 4.252951096121417e-07, + "loss": 0.0006, + "reward": 3.011402726173401, + "reward_std": 0.017416599672287703, + "rewards/final_reward": 0.44855465136059125, + "rewards/mask_iou_reward": 0.22427732568029562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0114028453826904, + "rewards/thk_ans_format_reward": 1.0, + "step": 1704, + "think_completion_length": 52.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.40625, + "epoch": 2.87858347386172, + "grad_norm": 10.274447825876113, + "kl": 0.708984375, + "learning_rate": 4.249578414839797e-07, + "loss": 0.0007, + "reward": 3.3437150716781616, + "reward_std": 0.1217598095536232, + "rewards/final_reward": 1.7857162257774482, + "rewards/mask_iou_reward": 0.8928581128887241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3437150716781616, + "rewards/thk_ans_format_reward": 1.0, + "step": 1705, + "think_completion_length": 45.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6875, + "epoch": 2.8802698145025296, + "grad_norm": 5.036390750197868, + "kl": 0.564453125, + "learning_rate": 4.2462057335581786e-07, + "loss": 0.0006, + "reward": 3.1259138584136963, + "reward_std": 0.014334550127387047, + "rewards/final_reward": 1.2241493610339746, + "rewards/mask_iou_reward": 0.6120746805169873, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1259138584136963, + "rewards/thk_ans_format_reward": 1.0, + "step": 1706, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.4375, + "epoch": 2.881956155143339, + "grad_norm": 14.405343265668892, + "kl": 0.505859375, + "learning_rate": 4.24283305227656e-07, + "loss": 0.0005, + "reward": 3.0240617990493774, + "reward_std": 0.11181307956576347, + "rewards/final_reward": 1.4497086806740294, + "rewards/mask_iou_reward": 0.7248543403370147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0240616202354431, + "rewards/thk_ans_format_reward": 1.0, + "step": 1707, + "think_completion_length": 51.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.125, + "epoch": 2.8836424957841484, + "grad_norm": 8.712780147567514, + "kl": 0.537109375, + "learning_rate": 4.239460370994941e-07, + "loss": 0.0005, + "reward": 3.6914472579956055, + "reward_std": 0.13960205670446157, + "rewards/final_reward": 1.730837492291958, + "rewards/mask_iou_reward": 0.865418746145979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6914473176002502, + "rewards/thk_ans_format_reward": 1.0, + "step": 1708, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.75, + "epoch": 2.885328836424958, + "grad_norm": 119.6328886128793, + "kl": 0.6796875, + "learning_rate": 4.2360876897133217e-07, + "loss": 0.0007, + "reward": 3.302128791809082, + "reward_std": 0.04586852062493563, + "rewards/final_reward": 0.9640497744462393, + "rewards/mask_iou_reward": 0.48202488722311965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3021288514137268, + "rewards/thk_ans_format_reward": 1.0, + "step": 1709, + "think_completion_length": 54.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.609375, + "epoch": 2.8870151770657673, + "grad_norm": 10.790821051390031, + "kl": 0.642578125, + "learning_rate": 4.232715008431703e-07, + "loss": 0.0006, + "reward": 3.2432212829589844, + "reward_std": 0.25815831683576107, + "rewards/final_reward": 1.198961451519287, + "rewards/mask_iou_reward": 0.5994807257596435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2432212829589844, + "rewards/thk_ans_format_reward": 1.0, + "step": 1710, + "think_completion_length": 56.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.765625, + "epoch": 2.8887015177065765, + "grad_norm": 8.464795399420126, + "kl": 0.5009765625, + "learning_rate": 4.229342327150084e-07, + "loss": 0.0005, + "reward": 3.3584113121032715, + "reward_std": 0.09209583140909672, + "rewards/final_reward": 1.1509888312868992, + "rewards/mask_iou_reward": 0.5754944156434496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3584113717079163, + "rewards/thk_ans_format_reward": 1.0, + "step": 1711, + "think_completion_length": 47.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.078125, + "epoch": 2.890387858347386, + "grad_norm": 9.531788486775692, + "kl": 0.501953125, + "learning_rate": 4.2259696458684654e-07, + "loss": 0.0005, + "reward": 3.5532913208007812, + "reward_std": 0.10277672484517097, + "rewards/final_reward": 1.6036280466363646, + "rewards/mask_iou_reward": 0.8018140233181823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5532912611961365, + "rewards/thk_ans_format_reward": 1.0, + "step": 1712, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.796875, + "epoch": 2.8920741989881957, + "grad_norm": 24.107209669435377, + "kl": 0.4765625, + "learning_rate": 4.222596964586846e-07, + "loss": 0.0005, + "reward": 3.5580878257751465, + "reward_std": 0.05756748793646693, + "rewards/final_reward": 1.4146137485463521, + "rewards/mask_iou_reward": 0.7073068742731761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.558087944984436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1713, + "think_completion_length": 47.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.09375, + "epoch": 2.893760539629005, + "grad_norm": 7.758736439924713, + "kl": 0.533203125, + "learning_rate": 4.2192242833052277e-07, + "loss": 0.0005, + "reward": 3.5002644062042236, + "reward_std": 0.2005770057439804, + "rewards/final_reward": 1.2092635447748536, + "rewards/mask_iou_reward": 0.6046317723874268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5002645254135132, + "rewards/thk_ans_format_reward": 1.0, + "step": 1714, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 2.8954468802698146, + "grad_norm": 6.39981184493797, + "kl": 0.658203125, + "learning_rate": 4.2158516020236085e-07, + "loss": 0.0007, + "reward": 3.34832501411438, + "reward_std": 0.07436983287334442, + "rewards/final_reward": 1.7210072380912143, + "rewards/mask_iou_reward": 0.8605036190456071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3483251929283142, + "rewards/thk_ans_format_reward": 1.0, + "step": 1715, + "think_completion_length": 44.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.65625, + "epoch": 2.897133220910624, + "grad_norm": 6.109057908741958, + "kl": 0.521484375, + "learning_rate": 4.21247892074199e-07, + "loss": 0.0005, + "reward": 3.4417446851730347, + "reward_std": 0.007995732361450791, + "rewards/final_reward": 0.9722025745398519, + "rewards/mask_iou_reward": 0.48610128726992596, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4417446851730347, + "rewards/thk_ans_format_reward": 1.0, + "step": 1716, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.453125, + "epoch": 2.8988195615514334, + "grad_norm": 8.108779936322028, + "kl": 0.693359375, + "learning_rate": 4.209106239460371e-07, + "loss": 0.0007, + "reward": 3.0282063484191895, + "reward_std": 0.29450612515211105, + "rewards/final_reward": 0.9873438877122154, + "rewards/mask_iou_reward": 0.4936719438561077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0282064378261566, + "rewards/thk_ans_format_reward": 1.0, + "step": 1717, + "think_completion_length": 51.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9375, + "epoch": 2.9005059021922426, + "grad_norm": 7.526311247628863, + "kl": 0.5322265625, + "learning_rate": 4.2057335581787517e-07, + "loss": 0.0005, + "reward": 3.740854859352112, + "reward_std": 0.03472239035181701, + "rewards/final_reward": 1.7201866005635802, + "rewards/mask_iou_reward": 0.8600933002817901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7408548593521118, + "rewards/thk_ans_format_reward": 1.0, + "step": 1718, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.375, + "epoch": 2.902192242833052, + "grad_norm": 17.340575174288656, + "kl": 0.560546875, + "learning_rate": 4.202360876897133e-07, + "loss": 0.0006, + "reward": 3.252333164215088, + "reward_std": 0.09733846783638, + "rewards/final_reward": 1.877978983533691, + "rewards/mask_iou_reward": 0.9389894917668455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.252333164215088, + "rewards/thk_ans_format_reward": 1.0, + "step": 1719, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.09375, + "epoch": 2.903878583473862, + "grad_norm": 11.916990798274322, + "kl": 0.525390625, + "learning_rate": 4.1989881956155145e-07, + "loss": 0.0005, + "reward": 3.2642595767974854, + "reward_std": 0.3322697635740042, + "rewards/final_reward": 1.6571951719452607, + "rewards/mask_iou_reward": 0.8285975859726303, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.3580095767974854, + "rewards/thk_ans_format_reward": 0.953125, + "step": 1720, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.28125, + "epoch": 2.905564924114671, + "grad_norm": 60.51395728112135, + "kl": 0.578125, + "learning_rate": 4.195615514333895e-07, + "loss": 0.0005, + "reward": 3.8474960327148438, + "reward_std": 0.09723218204453588, + "rewards/final_reward": 1.8935047756335932, + "rewards/mask_iou_reward": 0.9467523878167966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8474960327148438, + "rewards/thk_ans_format_reward": 1.0, + "step": 1721, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.96875, + "epoch": 2.9072512647554807, + "grad_norm": 6.1129558108301865, + "kl": 0.5703125, + "learning_rate": 4.192242833052276e-07, + "loss": 0.0005, + "reward": 3.4524412155151367, + "reward_std": 0.08695713616907597, + "rewards/final_reward": 1.4480904777145365, + "rewards/mask_iou_reward": 0.7240452388572682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.452441155910492, + "rewards/thk_ans_format_reward": 1.0, + "step": 1722, + "think_completion_length": 47.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.625, + "epoch": 2.9089376053962903, + "grad_norm": 12.061966103728397, + "kl": 0.56640625, + "learning_rate": 4.1888701517706576e-07, + "loss": 0.0006, + "reward": 3.232837438583374, + "reward_std": 0.16182934492826462, + "rewards/final_reward": 1.3887647567204833, + "rewards/mask_iou_reward": 0.6943823783602416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2328372597694397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1723, + "think_completion_length": 53.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.84375, + "epoch": 2.9106239460370995, + "grad_norm": 9.271259487912298, + "kl": 0.615234375, + "learning_rate": 4.1854974704890385e-07, + "loss": 0.0006, + "reward": 3.0772303342819214, + "reward_std": 0.19635407999157906, + "rewards/final_reward": 1.4006259692580718, + "rewards/mask_iou_reward": 0.7003129846290359, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0772303342819214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1724, + "think_completion_length": 46.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.78125, + "epoch": 2.9123102866779087, + "grad_norm": 14.68120288683905, + "kl": 0.599609375, + "learning_rate": 4.1821247892074194e-07, + "loss": 0.0006, + "reward": 3.557195544242859, + "reward_std": 0.06366473622620106, + "rewards/final_reward": 1.2397970681721528, + "rewards/mask_iou_reward": 0.6198985340860764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5571956038475037, + "rewards/thk_ans_format_reward": 1.0, + "step": 1725, + "think_completion_length": 48.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.1875, + "epoch": 2.9139966273187183, + "grad_norm": 13.195233955342133, + "kl": 0.533203125, + "learning_rate": 4.178752107925801e-07, + "loss": 0.0005, + "reward": 3.4791808128356934, + "reward_std": 0.0421409523114562, + "rewards/final_reward": 1.5218060859800824, + "rewards/mask_iou_reward": 0.7609030429900412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.479180932044983, + "rewards/thk_ans_format_reward": 1.0, + "step": 1726, + "think_completion_length": 52.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.171875, + "epoch": 2.915682967959528, + "grad_norm": 6.374675918777848, + "kl": 0.56640625, + "learning_rate": 4.175379426644182e-07, + "loss": 0.0006, + "reward": 3.0983314514160156, + "reward_std": 0.128284377977252, + "rewards/final_reward": 1.0925805377893214, + "rewards/mask_iou_reward": 0.5462902688946607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0983315706253052, + "rewards/thk_ans_format_reward": 1.0, + "step": 1727, + "think_completion_length": 50.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.40625, + "epoch": 2.917369308600337, + "grad_norm": 11.56261552116547, + "kl": 0.55078125, + "learning_rate": 4.172006745362563e-07, + "loss": 0.0006, + "reward": 3.524160861968994, + "reward_std": 0.06520858220756054, + "rewards/final_reward": 1.3922147334798918, + "rewards/mask_iou_reward": 0.6961073667399459, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5241608619689941, + "rewards/thk_ans_format_reward": 1.0, + "step": 1728, + "think_completion_length": 41.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.421875, + "epoch": 2.919055649241147, + "grad_norm": 11.645755711489283, + "kl": 0.64453125, + "learning_rate": 4.168634064080944e-07, + "loss": 0.0006, + "reward": 3.8355716466903687, + "reward_std": 0.004892201977781951, + "rewards/final_reward": 1.9233353356845395, + "rewards/mask_iou_reward": 0.9616676678422698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.835571825504303, + "rewards/thk_ans_format_reward": 1.0, + "step": 1729, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 2.920741989881956, + "grad_norm": 11.776779803616577, + "kl": 0.564453125, + "learning_rate": 4.1652613827993254e-07, + "loss": 0.0006, + "reward": 3.778007984161377, + "reward_std": 0.0795029029250145, + "rewards/final_reward": 1.8544093301855198, + "rewards/mask_iou_reward": 0.9272046650927599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.778007984161377, + "rewards/thk_ans_format_reward": 1.0, + "step": 1730, + "think_completion_length": 51.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.578125, + "epoch": 2.9224283305227656, + "grad_norm": 55.579732230614546, + "kl": 0.6328125, + "learning_rate": 4.161888701517706e-07, + "loss": 0.0006, + "reward": 3.334317922592163, + "reward_std": 0.12129126489162445, + "rewards/final_reward": 1.2962656919236346, + "rewards/mask_iou_reward": 0.6481328459618173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3343179821968079, + "rewards/thk_ans_format_reward": 1.0, + "step": 1731, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.234375, + "epoch": 2.924114671163575, + "grad_norm": 18.804690139625475, + "kl": 0.66015625, + "learning_rate": 4.1585160202360876e-07, + "loss": 0.0007, + "reward": 3.188036322593689, + "reward_std": 0.13786154240369797, + "rewards/final_reward": 1.4687809361757644, + "rewards/mask_iou_reward": 0.7343904680878822, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.203661322593689, + "rewards/thk_ans_format_reward": 1.0, + "step": 1732, + "think_completion_length": 51.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.1875, + "epoch": 2.9258010118043845, + "grad_norm": 18.20063333168867, + "kl": 0.955078125, + "learning_rate": 4.155143338954469e-07, + "loss": 0.001, + "reward": 3.518056869506836, + "reward_std": 0.3514831140637398, + "rewards/final_reward": 1.550659748466668, + "rewards/mask_iou_reward": 0.775329874233334, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5180569291114807, + "rewards/thk_ans_format_reward": 1.0, + "step": 1733, + "think_completion_length": 50.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.578125, + "epoch": 2.927487352445194, + "grad_norm": 39.32673591485834, + "kl": 0.58203125, + "learning_rate": 4.1517706576728494e-07, + "loss": 0.0006, + "reward": 3.3363709449768066, + "reward_std": 0.06575199589133263, + "rewards/final_reward": 1.2627733196875328, + "rewards/mask_iou_reward": 0.6313866598437664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3363710045814514, + "rewards/thk_ans_format_reward": 1.0, + "step": 1734, + "think_completion_length": 40.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.984375, + "epoch": 2.9291736930860033, + "grad_norm": 7.815025623286234, + "kl": 0.708984375, + "learning_rate": 4.148397976391231e-07, + "loss": 0.0007, + "reward": 3.329333782196045, + "reward_std": 0.27078694477677345, + "rewards/final_reward": 1.246577492295484, + "rewards/mask_iou_reward": 0.623288746147742, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3605839014053345, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1735, + "think_completion_length": 41.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.59375, + "epoch": 2.930860033726813, + "grad_norm": 10.34660338793664, + "kl": 0.576171875, + "learning_rate": 4.145025295109612e-07, + "loss": 0.0006, + "reward": 3.636704921722412, + "reward_std": 0.02508683316409588, + "rewards/final_reward": 1.674860461087448, + "rewards/mask_iou_reward": 0.837430230543724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.636704921722412, + "rewards/thk_ans_format_reward": 1.0, + "step": 1736, + "think_completion_length": 47.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.921875, + "epoch": 2.932546374367622, + "grad_norm": 15.646912978699548, + "kl": 0.71484375, + "learning_rate": 4.141652613827993e-07, + "loss": 0.0007, + "reward": 3.115605115890503, + "reward_std": 0.25399264693260193, + "rewards/final_reward": 1.125185816813323, + "rewards/mask_iou_reward": 0.5625929084066615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1156051754951477, + "rewards/thk_ans_format_reward": 1.0, + "step": 1737, + "think_completion_length": 47.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.484375, + "epoch": 2.9342327150084317, + "grad_norm": 6.28798738746728, + "kl": 0.578125, + "learning_rate": 4.138279932546374e-07, + "loss": 0.0006, + "reward": 3.1532175540924072, + "reward_std": 0.1299862286541611, + "rewards/final_reward": 0.7318573581565188, + "rewards/mask_iou_reward": 0.3659286790782594, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1532175540924072, + "rewards/thk_ans_format_reward": 1.0, + "step": 1738, + "think_completion_length": 47.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.296875, + "epoch": 2.935919055649241, + "grad_norm": 11.787626130142819, + "kl": 0.630859375, + "learning_rate": 4.1349072512647553e-07, + "loss": 0.0006, + "reward": 3.4893749952316284, + "reward_std": 0.24706952273845673, + "rewards/final_reward": 1.8051508827886227, + "rewards/mask_iou_reward": 0.9025754413943113, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4893749952316284, + "rewards/thk_ans_format_reward": 1.0, + "step": 1739, + "think_completion_length": 49.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.65625, + "epoch": 2.9376053962900506, + "grad_norm": 7.832585841665771, + "kl": 0.72265625, + "learning_rate": 4.131534569983137e-07, + "loss": 0.0007, + "reward": 3.078925609588623, + "reward_std": 0.08913594763725996, + "rewards/final_reward": 1.248415469949072, + "rewards/mask_iou_reward": 0.624207734974536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0789256691932678, + "rewards/thk_ans_format_reward": 1.0, + "step": 1740, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.484375, + "epoch": 2.93929173693086, + "grad_norm": 16.07860759176254, + "kl": 0.65625, + "learning_rate": 4.1281618887015176e-07, + "loss": 0.0007, + "reward": 3.5517622232437134, + "reward_std": 0.13729829341173172, + "rewards/final_reward": 1.441280645538618, + "rewards/mask_iou_reward": 0.720640322769309, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5517622232437134, + "rewards/thk_ans_format_reward": 1.0, + "step": 1741, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.015625, + "epoch": 2.9409780775716694, + "grad_norm": 7.257305752944733, + "kl": 0.5380859375, + "learning_rate": 4.1247892074198985e-07, + "loss": 0.0005, + "reward": 3.3132989406585693, + "reward_std": 0.2520294189453125, + "rewards/final_reward": 1.6513091533377802, + "rewards/mask_iou_reward": 0.8256545766688901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3132989704608917, + "rewards/thk_ans_format_reward": 1.0, + "step": 1742, + "think_completion_length": 41.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.21875, + "epoch": 2.942664418212479, + "grad_norm": 7.221074397046372, + "kl": 0.650390625, + "learning_rate": 4.12141652613828e-07, + "loss": 0.0006, + "reward": 3.4602547883987427, + "reward_std": 0.16891072317957878, + "rewards/final_reward": 1.589578745008967, + "rewards/mask_iou_reward": 0.7947893725044834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4602547883987427, + "rewards/thk_ans_format_reward": 1.0, + "step": 1743, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0, + "epoch": 2.9443507588532882, + "grad_norm": 15.417697002872668, + "kl": 0.572265625, + "learning_rate": 4.118043844856661e-07, + "loss": 0.0006, + "reward": 3.5061349868774414, + "reward_std": 0.15799922496080399, + "rewards/final_reward": 1.4792684918120789, + "rewards/mask_iou_reward": 0.7396342459060394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5061350464820862, + "rewards/thk_ans_format_reward": 1.0, + "step": 1744, + "think_completion_length": 46.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.625, + "epoch": 2.946037099494098, + "grad_norm": 5.94093777546409, + "kl": 0.59375, + "learning_rate": 4.114671163575042e-07, + "loss": 0.0006, + "reward": 3.624622106552124, + "reward_std": 0.04693530406802893, + "rewards/final_reward": 1.4988867171201967, + "rewards/mask_iou_reward": 0.7494433585600984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6246221661567688, + "rewards/thk_ans_format_reward": 1.0, + "step": 1745, + "think_completion_length": 43.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.15625, + "epoch": 2.947723440134907, + "grad_norm": 7.926401098507347, + "kl": 0.59765625, + "learning_rate": 4.111298482293423e-07, + "loss": 0.0006, + "reward": 3.3583085536956787, + "reward_std": 0.22104869782924652, + "rewards/final_reward": 1.0619630684221244, + "rewards/mask_iou_reward": 0.5309815342110622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3583085536956787, + "rewards/thk_ans_format_reward": 1.0, + "step": 1746, + "think_completion_length": 46.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.765625, + "epoch": 2.9494097807757167, + "grad_norm": 6.545742287407206, + "kl": 0.5390625, + "learning_rate": 4.107925801011804e-07, + "loss": 0.0005, + "reward": 3.228265881538391, + "reward_std": 0.20059365965425968, + "rewards/final_reward": 1.4879712694947818, + "rewards/mask_iou_reward": 0.7439856347473909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2282660007476807, + "rewards/thk_ans_format_reward": 1.0, + "step": 1747, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.828125, + "epoch": 2.9510961214165263, + "grad_norm": 10.569300190662831, + "kl": 0.4287109375, + "learning_rate": 4.1045531197301853e-07, + "loss": 0.0003, + "reward": 3.587049722671509, + "reward_std": 0.06058724317699671, + "rewards/final_reward": 1.2193715290183142, + "rewards/mask_iou_reward": 0.6096857645091571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5870497226715088, + "rewards/thk_ans_format_reward": 1.0, + "step": 1748, + "think_completion_length": 47.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.578125, + "epoch": 2.9527824620573355, + "grad_norm": 14.96704643685889, + "kl": 0.478515625, + "learning_rate": 4.1011804384485667e-07, + "loss": 0.0005, + "reward": 3.2427884340286255, + "reward_std": 0.026081462390720844, + "rewards/final_reward": 1.0459886978186481, + "rewards/mask_iou_reward": 0.5229943489093241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.242788314819336, + "rewards/thk_ans_format_reward": 1.0, + "step": 1749, + "think_completion_length": 47.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.484375, + "epoch": 2.954468802698145, + "grad_norm": 7.001821709693498, + "kl": 0.55078125, + "learning_rate": 4.097807757166947e-07, + "loss": 0.0005, + "reward": 3.2535473108291626, + "reward_std": 0.057197438552975655, + "rewards/final_reward": 1.3538752702792123, + "rewards/mask_iou_reward": 0.6769376351396061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.253547340631485, + "rewards/thk_ans_format_reward": 1.0, + "step": 1750, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.953125, + "epoch": 2.9561551433389543, + "grad_norm": 11.32756564935043, + "kl": 0.55859375, + "learning_rate": 4.0944350758853285e-07, + "loss": 0.0006, + "reward": 3.7669692039489746, + "reward_std": 0.03139904234558344, + "rewards/final_reward": 1.6746801459710268, + "rewards/mask_iou_reward": 0.8373400729855134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7669692635536194, + "rewards/thk_ans_format_reward": 1.0, + "step": 1751, + "think_completion_length": 42.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.671875, + "epoch": 2.957841483979764, + "grad_norm": 5.598772715073544, + "kl": 0.587890625, + "learning_rate": 4.09106239460371e-07, + "loss": 0.0006, + "reward": 3.5666593313217163, + "reward_std": 0.09166042506694794, + "rewards/final_reward": 1.5495096974513012, + "rewards/mask_iou_reward": 0.7747548487256506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5666593313217163, + "rewards/thk_ans_format_reward": 1.0, + "step": 1752, + "think_completion_length": 46.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.953125, + "epoch": 2.959527824620573, + "grad_norm": 13.584137909097144, + "kl": 0.6875, + "learning_rate": 4.0876897133220913e-07, + "loss": 0.0007, + "reward": 3.685564637184143, + "reward_std": 0.043231220450252295, + "rewards/final_reward": 1.5597897837836001, + "rewards/mask_iou_reward": 0.7798948918918001, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.685564637184143, + "rewards/thk_ans_format_reward": 1.0, + "step": 1753, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.140625, + "epoch": 2.961214165261383, + "grad_norm": 22.40003494317226, + "kl": 0.552734375, + "learning_rate": 4.0843170320404716e-07, + "loss": 0.0006, + "reward": 3.3257750272750854, + "reward_std": 0.2577382028102875, + "rewards/final_reward": 1.2813706582980735, + "rewards/mask_iou_reward": 0.6406853291490368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3257750272750854, + "rewards/thk_ans_format_reward": 1.0, + "step": 1754, + "think_completion_length": 45.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.921875, + "epoch": 2.9629005059021924, + "grad_norm": 26.131581991547865, + "kl": 0.673828125, + "learning_rate": 4.080944350758853e-07, + "loss": 0.0007, + "reward": 3.4210046529769897, + "reward_std": 0.24233996961265802, + "rewards/final_reward": 1.1368643662064861, + "rewards/mask_iou_reward": 0.5684321831032431, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4210045337677002, + "rewards/thk_ans_format_reward": 1.0, + "step": 1755, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.078125, + "epoch": 2.9645868465430016, + "grad_norm": 11.690378243373605, + "kl": 0.546875, + "learning_rate": 4.0775716694772344e-07, + "loss": 0.0006, + "reward": 3.2714394330978394, + "reward_std": 0.43962132930755615, + "rewards/final_reward": 0.7825476837291065, + "rewards/mask_iou_reward": 0.3912738418645533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2714394330978394, + "rewards/thk_ans_format_reward": 1.0, + "step": 1756, + "think_completion_length": 49.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.609375, + "epoch": 2.9662731871838113, + "grad_norm": 25.324780980102272, + "kl": 0.470703125, + "learning_rate": 4.0741989881956153e-07, + "loss": 0.0005, + "reward": 3.5654423236846924, + "reward_std": 0.06877763196825981, + "rewards/final_reward": 1.6263870640270404, + "rewards/mask_iou_reward": 0.8131935320135202, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5654422640800476, + "rewards/thk_ans_format_reward": 1.0, + "step": 1757, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75, + "epoch": 2.9679595278246205, + "grad_norm": 4.14111248427749, + "kl": 0.537109375, + "learning_rate": 4.070826306913996e-07, + "loss": 0.0005, + "reward": 3.3682453632354736, + "reward_std": 0.16081257164478302, + "rewards/final_reward": 1.6700165070119228, + "rewards/mask_iou_reward": 0.8350082535059614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3682452738285065, + "rewards/thk_ans_format_reward": 1.0, + "step": 1758, + "think_completion_length": 45.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.109375, + "epoch": 2.96964586846543, + "grad_norm": 6.129164184058998, + "kl": 0.5380859375, + "learning_rate": 4.0674536256323776e-07, + "loss": 0.0005, + "reward": 3.21283495426178, + "reward_std": 0.12503460049629211, + "rewards/final_reward": 1.8603373570177637, + "rewards/mask_iou_reward": 0.9301686785088819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2128351330757141, + "rewards/thk_ans_format_reward": 1.0, + "step": 1759, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 2.9713322091062393, + "grad_norm": 13.32716958264156, + "kl": 0.56640625, + "learning_rate": 4.0640809443507585e-07, + "loss": 0.0006, + "reward": 3.5802561044692993, + "reward_std": 0.13087457790970802, + "rewards/final_reward": 1.5221519900021883, + "rewards/mask_iou_reward": 0.7610759950010941, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5802561044692993, + "rewards/thk_ans_format_reward": 1.0, + "step": 1760, + "think_completion_length": 47.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.984375, + "epoch": 2.973018549747049, + "grad_norm": 5.719532211581828, + "kl": 0.529296875, + "learning_rate": 4.06070826306914e-07, + "loss": 0.0005, + "reward": 3.140426754951477, + "reward_std": 0.013218533713370562, + "rewards/final_reward": 1.105653311013493, + "rewards/mask_iou_reward": 0.5528266555067465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.140426754951477, + "rewards/thk_ans_format_reward": 1.0, + "step": 1761, + "think_completion_length": 48.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.78125, + "epoch": 2.9747048903878586, + "grad_norm": 8.80106383889409, + "kl": 0.611328125, + "learning_rate": 4.057335581787521e-07, + "loss": 0.0006, + "reward": 3.4417465925216675, + "reward_std": 0.09109633043408394, + "rewards/final_reward": 1.3834932937002937, + "rewards/mask_iou_reward": 0.6917466468501469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.441746711730957, + "rewards/thk_ans_format_reward": 1.0, + "step": 1762, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.96875, + "epoch": 2.9763912310286678, + "grad_norm": 5.198333548111168, + "kl": 0.603515625, + "learning_rate": 4.053962900505902e-07, + "loss": 0.0006, + "reward": 3.007934093475342, + "reward_std": 0.30538563430309296, + "rewards/final_reward": 0.8716766112194474, + "rewards/mask_iou_reward": 0.4358383056097237, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0079339742660522, + "rewards/thk_ans_format_reward": 1.0, + "step": 1763, + "think_completion_length": 43.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.46875, + "epoch": 2.9780775716694774, + "grad_norm": 8.420377628718501, + "kl": 0.52734375, + "learning_rate": 4.050590219224283e-07, + "loss": 0.0005, + "reward": 2.945202946662903, + "reward_std": 0.10134740360081196, + "rewards/final_reward": 1.0092593098713687, + "rewards/mask_iou_reward": 0.5046296549356843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9452029466629028, + "rewards/thk_ans_format_reward": 1.0, + "step": 1764, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.453125, + "epoch": 2.9797639123102866, + "grad_norm": 6.42537877319895, + "kl": 0.580078125, + "learning_rate": 4.0472175379426644e-07, + "loss": 0.0006, + "reward": 2.7219390869140625, + "reward_std": 0.10427241958677769, + "rewards/final_reward": 0.5188378030255072, + "rewards/mask_iou_reward": 0.2594189015127536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.721939206123352, + "rewards/thk_ans_format_reward": 1.0, + "step": 1765, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.78125, + "epoch": 2.9814502529510962, + "grad_norm": 9.522081916215093, + "kl": 0.5546875, + "learning_rate": 4.043844856661046e-07, + "loss": 0.0006, + "reward": 3.8684887886047363, + "reward_std": 0.01769642811268568, + "rewards/final_reward": 1.8679418511744035, + "rewards/mask_iou_reward": 0.9339709255872017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8684889078140259, + "rewards/thk_ans_format_reward": 1.0, + "step": 1766, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.84375, + "epoch": 2.9831365935919054, + "grad_norm": 13.612631308119484, + "kl": 0.630859375, + "learning_rate": 4.040472175379426e-07, + "loss": 0.0006, + "reward": 3.1474589109420776, + "reward_std": 0.029912306927144527, + "rewards/final_reward": 1.2405054257944175, + "rewards/mask_iou_reward": 0.6202527128972087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1474591195583344, + "rewards/thk_ans_format_reward": 1.0, + "step": 1767, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.578125, + "epoch": 2.984822934232715, + "grad_norm": 19.224794027956456, + "kl": 0.5224609375, + "learning_rate": 4.0370994940978076e-07, + "loss": 0.0005, + "reward": 3.060731887817383, + "reward_std": 0.10396159812808037, + "rewards/final_reward": 1.2167539201734914, + "rewards/mask_iou_reward": 0.6083769600867457, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.060731828212738, + "rewards/thk_ans_format_reward": 1.0, + "step": 1768, + "think_completion_length": 37.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.796875, + "epoch": 2.9865092748735247, + "grad_norm": 7.278733591497656, + "kl": 0.4716796875, + "learning_rate": 4.033726812816189e-07, + "loss": 0.0004, + "reward": 3.6651105880737305, + "reward_std": 0.30302999913692474, + "rewards/final_reward": 1.6531495825342206, + "rewards/mask_iou_reward": 0.8265747912671103, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.6807357668876648, + "rewards/thk_ans_format_reward": 1.0, + "step": 1769, + "think_completion_length": 49.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 2.988195615514334, + "grad_norm": 15.610608125179827, + "kl": 0.609375, + "learning_rate": 4.03035413153457e-07, + "loss": 0.0006, + "reward": 3.306709885597229, + "reward_std": 0.160341314971447, + "rewards/final_reward": 1.3282952467844442, + "rewards/mask_iou_reward": 0.6641476233922221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.306709885597229, + "rewards/thk_ans_format_reward": 1.0, + "step": 1770, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8125, + "epoch": 2.989881956155143, + "grad_norm": 19.309088968802552, + "kl": 0.576171875, + "learning_rate": 4.0269814502529507e-07, + "loss": 0.0006, + "reward": 3.4969851970672607, + "reward_std": 0.049538787454366684, + "rewards/final_reward": 1.8811737143384755, + "rewards/mask_iou_reward": 0.9405868571692377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4969850778579712, + "rewards/thk_ans_format_reward": 1.0, + "step": 1771, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.921875, + "epoch": 2.9915682967959527, + "grad_norm": 8.792433933486913, + "kl": 0.4873046875, + "learning_rate": 4.023608768971332e-07, + "loss": 0.0005, + "reward": 3.364785671234131, + "reward_std": 0.1412600614130497, + "rewards/final_reward": 1.4665705126725315, + "rewards/mask_iou_reward": 0.7332852563362657, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.364785611629486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1772, + "think_completion_length": 38.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 2.9932546374367623, + "grad_norm": 6.145988479809462, + "kl": 0.58203125, + "learning_rate": 4.020236087689713e-07, + "loss": 0.0006, + "reward": 3.1206058263778687, + "reward_std": 0.20862603932619095, + "rewards/final_reward": 1.0969574547093313, + "rewards/mask_iou_reward": 0.5484787273546656, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1206059157848358, + "rewards/thk_ans_format_reward": 1.0, + "step": 1773, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.1875, + "epoch": 2.9949409780775715, + "grad_norm": 9.213062201212532, + "kl": 0.5859375, + "learning_rate": 4.0168634064080944e-07, + "loss": 0.0006, + "reward": 3.3638638257980347, + "reward_std": 0.18089959397912025, + "rewards/final_reward": 1.077929304204421, + "rewards/mask_iou_reward": 0.5389646521022105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3638638854026794, + "rewards/thk_ans_format_reward": 1.0, + "step": 1774, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.046875, + "epoch": 2.996627318718381, + "grad_norm": 11.1703013355821, + "kl": 0.59375, + "learning_rate": 4.013490725126475e-07, + "loss": 0.0006, + "reward": 2.8246114253997803, + "reward_std": 0.32476918399333954, + "rewards/final_reward": 1.1536238002372674, + "rewards/mask_iou_reward": 0.5768119001186337, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.8558615148067474, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1775, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.66666793823242, + "epoch": 2.998313659359191, + "grad_norm": 8.430944669515075, + "kl": 0.634765625, + "learning_rate": 4.0101180438448567e-07, + "loss": 0.0007, + "reward": 3.570656657218933, + "reward_std": 0.26290661143139005, + "rewards/final_reward": 1.7582903216175922, + "rewards/mask_iou_reward": 0.8791451608087961, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5706565380096436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1776, + "think_completion_length": 39.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.125, + "epoch": 3.0016863406408096, + "grad_norm": 7.531012330436627, + "kl": 0.63671875, + "learning_rate": 4.0067453625632375e-07, + "loss": 0.0006, + "reward": 3.4569878578186035, + "reward_std": 0.05659590847790241, + "rewards/final_reward": 1.430502925295769, + "rewards/mask_iou_reward": 0.7152514626478845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.456987977027893, + "rewards/thk_ans_format_reward": 1.0, + "step": 1777, + "think_completion_length": 43.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.265625, + "epoch": 3.003372681281619, + "grad_norm": 12.588023913728449, + "kl": 0.904296875, + "learning_rate": 4.003372681281619e-07, + "loss": 0.0009, + "reward": 2.8988207578659058, + "reward_std": 0.06972062401473522, + "rewards/final_reward": 0.8970200385350756, + "rewards/mask_iou_reward": 0.4485100192675378, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8988207578659058, + "rewards/thk_ans_format_reward": 1.0, + "step": 1778, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.28125, + "epoch": 3.0050590219224285, + "grad_norm": 7.991088506498573, + "kl": 0.6171875, + "learning_rate": 4e-07, + "loss": 0.0006, + "reward": 3.4317870140075684, + "reward_std": 0.06494680885225534, + "rewards/final_reward": 1.5347575528414017, + "rewards/mask_iou_reward": 0.7673787764207008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4317869544029236, + "rewards/thk_ans_format_reward": 1.0, + "step": 1779, + "think_completion_length": 42.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6875, + "epoch": 3.0067453625632377, + "grad_norm": 6.296840375046232, + "kl": 0.58984375, + "learning_rate": 3.9966273187183807e-07, + "loss": 0.0006, + "reward": 3.519545555114746, + "reward_std": 0.0419915160164237, + "rewards/final_reward": 1.8707376427400908, + "rewards/mask_iou_reward": 0.9353688213700454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.519545555114746, + "rewards/thk_ans_format_reward": 1.0, + "step": 1780, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.015625, + "epoch": 3.0084317032040473, + "grad_norm": 10.765617905439093, + "kl": 0.62890625, + "learning_rate": 3.993254637436762e-07, + "loss": 0.0006, + "reward": 2.860144257545471, + "reward_std": 0.21056190133094788, + "rewards/final_reward": 0.46607457041584754, + "rewards/mask_iou_reward": 0.23303728520792377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8601441979408264, + "rewards/thk_ans_format_reward": 1.0, + "step": 1781, + "think_completion_length": 42.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.21875, + "epoch": 3.0101180438448565, + "grad_norm": 12.267309726746078, + "kl": 0.59375, + "learning_rate": 3.9898819561551435e-07, + "loss": 0.0006, + "reward": 3.506497859954834, + "reward_std": 0.07470344379544258, + "rewards/final_reward": 1.2161605416955577, + "rewards/mask_iou_reward": 0.6080802708477788, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.506497859954834, + "rewards/thk_ans_format_reward": 1.0, + "step": 1782, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.109375, + "epoch": 3.011804384485666, + "grad_norm": 5.569775037193244, + "kl": 0.609375, + "learning_rate": 3.986509274873524e-07, + "loss": 0.0006, + "reward": 3.2471498250961304, + "reward_std": 0.023415432777255774, + "rewards/final_reward": 1.3677550518400219, + "rewards/mask_iou_reward": 0.6838775259200109, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2471499741077423, + "rewards/thk_ans_format_reward": 1.0, + "step": 1783, + "think_completion_length": 42.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.1875, + "epoch": 3.0134907251264758, + "grad_norm": 13.149543840594943, + "kl": 0.6640625, + "learning_rate": 3.983136593591905e-07, + "loss": 0.0007, + "reward": 3.5586479902267456, + "reward_std": 0.0933561883866787, + "rewards/final_reward": 1.5804140872277448, + "rewards/mask_iou_reward": 0.7902070436138724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5586479306221008, + "rewards/thk_ans_format_reward": 1.0, + "step": 1784, + "think_completion_length": 40.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.265625, + "epoch": 3.015177065767285, + "grad_norm": 6.182000400527594, + "kl": 0.7109375, + "learning_rate": 3.9797639123102867e-07, + "loss": 0.0007, + "reward": 3.558907985687256, + "reward_std": 0.04706683196127415, + "rewards/final_reward": 1.738189228029709, + "rewards/mask_iou_reward": 0.8690946140148545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5589080452919006, + "rewards/thk_ans_format_reward": 1.0, + "step": 1785, + "think_completion_length": 38.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.515625, + "epoch": 3.0168634064080946, + "grad_norm": 18.46072762580063, + "kl": 0.5625, + "learning_rate": 3.9763912310286675e-07, + "loss": 0.0006, + "reward": 3.6227529048919678, + "reward_std": 0.2536686926614493, + "rewards/final_reward": 1.8278806704202584, + "rewards/mask_iou_reward": 0.9139403352101292, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6227527856826782, + "rewards/thk_ans_format_reward": 1.0, + "step": 1786, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.515625, + "epoch": 3.0185497470489038, + "grad_norm": 5.981655691507406, + "kl": 0.6484375, + "learning_rate": 3.9730185497470484e-07, + "loss": 0.0006, + "reward": 3.110731363296509, + "reward_std": 0.10306009650230408, + "rewards/final_reward": 1.2463825041064651, + "rewards/mask_iou_reward": 0.6231912520532326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1107314229011536, + "rewards/thk_ans_format_reward": 1.0, + "step": 1787, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.578125, + "epoch": 3.0202360876897134, + "grad_norm": 19.93758781645736, + "kl": 0.70703125, + "learning_rate": 3.96964586846543e-07, + "loss": 0.0007, + "reward": 2.9761908054351807, + "reward_std": 0.2858365625143051, + "rewards/final_reward": 1.15384431231489, + "rewards/mask_iou_reward": 0.576922156157445, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.007440596818924, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1788, + "think_completion_length": 44.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.84375, + "epoch": 3.0219224283305226, + "grad_norm": 8.834083836552558, + "kl": 0.576171875, + "learning_rate": 3.966273187183811e-07, + "loss": 0.0006, + "reward": 3.8274420499801636, + "reward_std": 0.04052088037133217, + "rewards/final_reward": 1.8624797041000785, + "rewards/mask_iou_reward": 0.9312398520500392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8274420499801636, + "rewards/thk_ans_format_reward": 1.0, + "step": 1789, + "think_completion_length": 40.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.71875, + "epoch": 3.0236087689713322, + "grad_norm": 7.803750825223315, + "kl": 0.59765625, + "learning_rate": 3.962900505902192e-07, + "loss": 0.0005, + "reward": 2.902255654335022, + "reward_std": 0.09178433939814568, + "rewards/final_reward": 1.4428867109175392, + "rewards/mask_iou_reward": 0.7214433554587696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9022556245326996, + "rewards/thk_ans_format_reward": 1.0, + "step": 1790, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.234375, + "epoch": 3.0252951096121414, + "grad_norm": 7.234996353582277, + "kl": 0.953125, + "learning_rate": 3.9595278246205735e-07, + "loss": 0.0009, + "reward": 3.684625029563904, + "reward_std": 0.034050445072352886, + "rewards/final_reward": 1.7472793780932192, + "rewards/mask_iou_reward": 0.8736396890466096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6846250891685486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1791, + "think_completion_length": 39.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.484375, + "epoch": 3.026981450252951, + "grad_norm": 7.483545957839358, + "kl": 0.609375, + "learning_rate": 3.9561551433389544e-07, + "loss": 0.0006, + "reward": 3.345807671546936, + "reward_std": 0.21587074548006058, + "rewards/final_reward": 1.7519694489910984, + "rewards/mask_iou_reward": 0.8759847244955492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.345807671546936, + "rewards/thk_ans_format_reward": 1.0, + "step": 1792, + "think_completion_length": 39.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.546875, + "epoch": 3.0286677908937607, + "grad_norm": 9.931220248630632, + "kl": 0.609375, + "learning_rate": 3.952782462057335e-07, + "loss": 0.0006, + "reward": 3.640244960784912, + "reward_std": 0.11672806553542614, + "rewards/final_reward": 1.5120892039081364, + "rewards/mask_iou_reward": 0.7560446019540682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6402450799942017, + "rewards/thk_ans_format_reward": 1.0, + "step": 1793, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.640625, + "epoch": 3.03035413153457, + "grad_norm": 26.60166896179453, + "kl": 0.54296875, + "learning_rate": 3.9494097807757166e-07, + "loss": 0.0006, + "reward": 3.89884877204895, + "reward_std": 0.1477795336395502, + "rewards/final_reward": 1.89302786543562, + "rewards/mask_iou_reward": 0.94651393271781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.898848831653595, + "rewards/thk_ans_format_reward": 1.0, + "step": 1794, + "think_completion_length": 43.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0, + "epoch": 3.0320404721753795, + "grad_norm": 14.90534750812744, + "kl": 0.6015625, + "learning_rate": 3.946037099494098e-07, + "loss": 0.0006, + "reward": 3.7795809507369995, + "reward_std": 0.018881912576034665, + "rewards/final_reward": 1.8978212875884442, + "rewards/mask_iou_reward": 0.9489106437942221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.77958083152771, + "rewards/thk_ans_format_reward": 1.0, + "step": 1795, + "think_completion_length": 43.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.4375, + "epoch": 3.0337268128161887, + "grad_norm": 7.302222131132335, + "kl": 0.537109375, + "learning_rate": 3.9426644182124784e-07, + "loss": 0.0005, + "reward": 3.4302122592926025, + "reward_std": 0.11048243194818497, + "rewards/final_reward": 1.775548221195335, + "rewards/mask_iou_reward": 0.8877741105976675, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.430212378501892, + "rewards/thk_ans_format_reward": 1.0, + "step": 1796, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.953125, + "epoch": 3.0354131534569984, + "grad_norm": 8.81006701656291, + "kl": 0.595703125, + "learning_rate": 3.93929173693086e-07, + "loss": 0.0006, + "reward": 3.185991048812866, + "reward_std": 0.052945384522899985, + "rewards/final_reward": 0.7679530304436115, + "rewards/mask_iou_reward": 0.38397651522180576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.185991108417511, + "rewards/thk_ans_format_reward": 1.0, + "step": 1797, + "think_completion_length": 38.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.640625, + "epoch": 3.0370994940978076, + "grad_norm": 5.821901064823728, + "kl": 0.640625, + "learning_rate": 3.935919055649241e-07, + "loss": 0.0007, + "reward": 3.6197181940078735, + "reward_std": 0.035361507907509804, + "rewards/final_reward": 1.424851699363638, + "rewards/mask_iou_reward": 0.712425849681819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.619718074798584, + "rewards/thk_ans_format_reward": 1.0, + "step": 1798, + "think_completion_length": 37.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.546875, + "epoch": 3.038785834738617, + "grad_norm": 10.127292142360464, + "kl": 0.548828125, + "learning_rate": 3.932546374367622e-07, + "loss": 0.0005, + "reward": 3.2065749168395996, + "reward_std": 0.18304883688688278, + "rewards/final_reward": 1.2521124267152612, + "rewards/mask_iou_reward": 0.6260562133576306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2065749168395996, + "rewards/thk_ans_format_reward": 1.0, + "step": 1799, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.40625, + "epoch": 3.040472175379427, + "grad_norm": 8.878520520027035, + "kl": 0.564453125, + "learning_rate": 3.929173693086003e-07, + "loss": 0.0006, + "reward": 3.5881153345108032, + "reward_std": 0.03201424656435847, + "rewards/final_reward": 1.4877122646509164, + "rewards/mask_iou_reward": 0.7438561323254582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5881155133247375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1800, + "think_completion_length": 40.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.90625, + "epoch": 3.042158516020236, + "grad_norm": 7.864526301429089, + "kl": 0.587890625, + "learning_rate": 3.9258010118043843e-07, + "loss": 0.0005, + "reward": 3.623053550720215, + "reward_std": 0.2628798196092248, + "rewards/final_reward": 1.631788901154074, + "rewards/mask_iou_reward": 0.815894450577037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6230534315109253, + "rewards/thk_ans_format_reward": 1.0, + "step": 1801, + "think_completion_length": 45.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.296875, + "epoch": 3.0438448566610457, + "grad_norm": 7.755192761497758, + "kl": 0.595703125, + "learning_rate": 3.922428330522766e-07, + "loss": 0.0006, + "reward": 3.6610530614852905, + "reward_std": 0.06777806580066681, + "rewards/final_reward": 1.803045456398975, + "rewards/mask_iou_reward": 0.9015227281994875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6610528826713562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1802, + "think_completion_length": 45.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.53125, + "epoch": 3.045531197301855, + "grad_norm": 11.508893233818217, + "kl": 0.708984375, + "learning_rate": 3.9190556492411466e-07, + "loss": 0.0007, + "reward": 2.952297568321228, + "reward_std": 0.0924637708812952, + "rewards/final_reward": 0.9303527118823702, + "rewards/mask_iou_reward": 0.4651763559411851, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.952297568321228, + "rewards/thk_ans_format_reward": 1.0, + "step": 1803, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.578125, + "epoch": 3.0472175379426645, + "grad_norm": 8.761930391197803, + "kl": 0.625, + "learning_rate": 3.9156829679595275e-07, + "loss": 0.0006, + "reward": 3.518397331237793, + "reward_std": 0.10372760146856308, + "rewards/final_reward": 1.7520384498088726, + "rewards/mask_iou_reward": 0.8760192249044363, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5183972716331482, + "rewards/thk_ans_format_reward": 1.0, + "step": 1804, + "think_completion_length": 39.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.921875, + "epoch": 3.0489038785834737, + "grad_norm": 6.091115422282127, + "kl": 0.62109375, + "learning_rate": 3.912310286677909e-07, + "loss": 0.0006, + "reward": 3.6024467945098877, + "reward_std": 0.038248912431299686, + "rewards/final_reward": 1.5841322268045297, + "rewards/mask_iou_reward": 0.7920661134022648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6024468541145325, + "rewards/thk_ans_format_reward": 1.0, + "step": 1805, + "think_completion_length": 43.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.765625, + "epoch": 3.0505902192242833, + "grad_norm": 10.536867458322101, + "kl": 0.583984375, + "learning_rate": 3.90893760539629e-07, + "loss": 0.0006, + "reward": 3.5221776962280273, + "reward_std": 0.19090192764997482, + "rewards/final_reward": 1.1545756872413946, + "rewards/mask_iou_reward": 0.5772878436206973, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5221776962280273, + "rewards/thk_ans_format_reward": 1.0, + "step": 1806, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.78125, + "epoch": 3.052276559865093, + "grad_norm": 5.970328374731551, + "kl": 0.517578125, + "learning_rate": 3.905564924114671e-07, + "loss": 0.0005, + "reward": 2.939510464668274, + "reward_std": 0.046596916392445564, + "rewards/final_reward": 1.0414819878594614, + "rewards/mask_iou_reward": 0.5207409939297307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9395104348659515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1807, + "think_completion_length": 40.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.046875, + "epoch": 3.053962900505902, + "grad_norm": 8.809238899912609, + "kl": 0.60546875, + "learning_rate": 3.902192242833052e-07, + "loss": 0.0006, + "reward": 3.550176501274109, + "reward_std": 0.18766392022371292, + "rewards/final_reward": 1.3816421309674876, + "rewards/mask_iou_reward": 0.6908210654837438, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5501765012741089, + "rewards/thk_ans_format_reward": 1.0, + "step": 1808, + "think_completion_length": 37.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.734375, + "epoch": 3.0556492411467118, + "grad_norm": 6.52671823087687, + "kl": 0.6796875, + "learning_rate": 3.898819561551433e-07, + "loss": 0.0007, + "reward": 3.23820424079895, + "reward_std": 0.14676055498421192, + "rewards/final_reward": 1.2919858793710988, + "rewards/mask_iou_reward": 0.6459929396855494, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2382042407989502, + "rewards/thk_ans_format_reward": 1.0, + "step": 1809, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.15625, + "epoch": 3.057335581787521, + "grad_norm": 11.07613980798843, + "kl": 0.55078125, + "learning_rate": 3.8954468802698143e-07, + "loss": 0.0006, + "reward": 3.0210577249526978, + "reward_std": 0.08838908141478896, + "rewards/final_reward": 1.0451431337686916, + "rewards/mask_iou_reward": 0.5225715668843458, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0210577249526978, + "rewards/thk_ans_format_reward": 1.0, + "step": 1810, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.578125, + "epoch": 3.0590219224283306, + "grad_norm": 16.112354584949998, + "kl": 0.5703125, + "learning_rate": 3.8920741989881957e-07, + "loss": 0.0006, + "reward": 3.423478841781616, + "reward_std": 0.012333399849012494, + "rewards/final_reward": 0.942286153388663, + "rewards/mask_iou_reward": 0.4711430766943315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4234787225723267, + "rewards/thk_ans_format_reward": 1.0, + "step": 1811, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 3.06070826306914, + "grad_norm": 10.626531813912603, + "kl": 0.638671875, + "learning_rate": 3.888701517706576e-07, + "loss": 0.0006, + "reward": 3.6403441429138184, + "reward_std": 0.022046887315809727, + "rewards/final_reward": 1.6076258115450064, + "rewards/mask_iou_reward": 0.8038129057725032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6403440237045288, + "rewards/thk_ans_format_reward": 1.0, + "step": 1812, + "think_completion_length": 41.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.171875, + "epoch": 3.0623946037099494, + "grad_norm": 10.574157055501939, + "kl": 0.626953125, + "learning_rate": 3.8853288364249575e-07, + "loss": 0.0006, + "reward": 3.482790231704712, + "reward_std": 0.10968651808798313, + "rewards/final_reward": 1.584461325845456, + "rewards/mask_iou_reward": 0.792230662922728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4827901124954224, + "rewards/thk_ans_format_reward": 1.0, + "step": 1813, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.625, + "epoch": 3.064080944350759, + "grad_norm": 8.79314057815561, + "kl": 0.580078125, + "learning_rate": 3.881956155143339e-07, + "loss": 0.0006, + "reward": 3.6398913860321045, + "reward_std": 0.2976074144244194, + "rewards/final_reward": 1.501840785814263, + "rewards/mask_iou_reward": 0.7509203929071315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6398914456367493, + "rewards/thk_ans_format_reward": 1.0, + "step": 1814, + "think_completion_length": 44.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.515625, + "epoch": 3.0657672849915683, + "grad_norm": 6.234654076026071, + "kl": 0.529296875, + "learning_rate": 3.8785834738617203e-07, + "loss": 0.0005, + "reward": 3.3249377012252808, + "reward_std": 0.2078157588839531, + "rewards/final_reward": 1.3281841385162092, + "rewards/mask_iou_reward": 0.6640920692581046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3249376714229584, + "rewards/thk_ans_format_reward": 1.0, + "step": 1815, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0625, + "epoch": 3.067453625632378, + "grad_norm": 10.396544747347201, + "kl": 0.564453125, + "learning_rate": 3.8752107925801006e-07, + "loss": 0.0006, + "reward": 3.4935293197631836, + "reward_std": 0.04673771560192108, + "rewards/final_reward": 1.6538122748826627, + "rewards/mask_iou_reward": 0.8269061374413313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4935293197631836, + "rewards/thk_ans_format_reward": 1.0, + "step": 1816, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.46875, + "epoch": 3.069139966273187, + "grad_norm": 21.842182739016284, + "kl": 0.544921875, + "learning_rate": 3.871838111298482e-07, + "loss": 0.0005, + "reward": 3.6569454669952393, + "reward_std": 0.14664340764284134, + "rewards/final_reward": 1.8561856677784982, + "rewards/mask_iou_reward": 0.9280928338892491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6569453477859497, + "rewards/thk_ans_format_reward": 1.0, + "step": 1817, + "think_completion_length": 39.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.5625, + "epoch": 3.0708263069139967, + "grad_norm": 14.884495823563416, + "kl": 0.45703125, + "learning_rate": 3.8684654300168634e-07, + "loss": 0.0005, + "reward": 3.5717333555221558, + "reward_std": 0.16016625985503197, + "rewards/final_reward": 1.4241937334443704, + "rewards/mask_iou_reward": 0.7120968667221852, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.571733295917511, + "rewards/thk_ans_format_reward": 1.0, + "step": 1818, + "think_completion_length": 43.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.40625, + "epoch": 3.072512647554806, + "grad_norm": 100.86990875217444, + "kl": 0.65234375, + "learning_rate": 3.8650927487352443e-07, + "loss": 0.0007, + "reward": 3.7288215160369873, + "reward_std": 0.055927949026227, + "rewards/final_reward": 1.616559484813261, + "rewards/mask_iou_reward": 0.8082797424066305, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.728821575641632, + "rewards/thk_ans_format_reward": 1.0, + "step": 1819, + "think_completion_length": 43.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 3.0741989881956155, + "grad_norm": 34.84247946910707, + "kl": 0.59375, + "learning_rate": 3.861720067453625e-07, + "loss": 0.0006, + "reward": 3.3209747076034546, + "reward_std": 0.15957476571202278, + "rewards/final_reward": 1.6474255953448638, + "rewards/mask_iou_reward": 0.8237127976724319, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3209747672080994, + "rewards/thk_ans_format_reward": 1.0, + "step": 1820, + "think_completion_length": 47.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.890625, + "epoch": 3.075885328836425, + "grad_norm": 9.821093622718976, + "kl": 0.541015625, + "learning_rate": 3.8583473861720066e-07, + "loss": 0.0005, + "reward": 3.1136986017227173, + "reward_std": 0.3179262578487396, + "rewards/final_reward": 1.0161626959025258, + "rewards/mask_iou_reward": 0.5080813479512629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1136985421180725, + "rewards/thk_ans_format_reward": 1.0, + "step": 1821, + "think_completion_length": 50.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.71875, + "epoch": 3.0775716694772344, + "grad_norm": 8.86875330195681, + "kl": 0.609375, + "learning_rate": 3.8549747048903875e-07, + "loss": 0.0006, + "reward": 3.530484437942505, + "reward_std": 0.10517753660678864, + "rewards/final_reward": 1.6550108728196373, + "rewards/mask_iou_reward": 0.8275054364098187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5304844379425049, + "rewards/thk_ans_format_reward": 1.0, + "step": 1822, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.640625, + "epoch": 3.079258010118044, + "grad_norm": 10.663893597077623, + "kl": 0.541015625, + "learning_rate": 3.851602023608769e-07, + "loss": 0.0005, + "reward": 3.1298916339874268, + "reward_std": 0.1926565244793892, + "rewards/final_reward": 1.1494982733137804, + "rewards/mask_iou_reward": 0.5747491366568902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1298914551734924, + "rewards/thk_ans_format_reward": 1.0, + "step": 1823, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.53125, + "epoch": 3.080944350758853, + "grad_norm": 5.634180631015267, + "kl": 0.54296875, + "learning_rate": 3.84822934232715e-07, + "loss": 0.0005, + "reward": 3.306205630302429, + "reward_std": 0.134456398896873, + "rewards/final_reward": 1.1425007583452398, + "rewards/mask_iou_reward": 0.5712503791726199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3062056303024292, + "rewards/thk_ans_format_reward": 1.0, + "step": 1824, + "think_completion_length": 46.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.875, + "epoch": 3.082630691399663, + "grad_norm": 9.726825128136838, + "kl": 1.09375, + "learning_rate": 3.8448566610455306e-07, + "loss": 0.001, + "reward": 3.8346996307373047, + "reward_std": 0.02099014213308692, + "rewards/final_reward": 1.9401270741644736, + "rewards/mask_iou_reward": 0.9700635370822368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.834699809551239, + "rewards/thk_ans_format_reward": 1.0, + "step": 1825, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.296875, + "epoch": 3.084317032040472, + "grad_norm": 8.46581095839141, + "kl": 0.578125, + "learning_rate": 3.841483979763912e-07, + "loss": 0.0006, + "reward": 3.462043046951294, + "reward_std": 0.1276659220457077, + "rewards/final_reward": 1.5772126033320073, + "rewards/mask_iou_reward": 0.7886063016660037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4620429277420044, + "rewards/thk_ans_format_reward": 1.0, + "step": 1826, + "think_completion_length": 44.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.640625, + "epoch": 3.0860033726812817, + "grad_norm": 6.099141209808028, + "kl": 0.4775390625, + "learning_rate": 3.8381112984822934e-07, + "loss": 0.0005, + "reward": 3.5183308124542236, + "reward_std": 0.0571708045899868, + "rewards/final_reward": 1.3577895920311414, + "rewards/mask_iou_reward": 0.6788947960155707, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.518330991268158, + "rewards/thk_ans_format_reward": 1.0, + "step": 1827, + "think_completion_length": 44.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.375, + "epoch": 3.087689713322091, + "grad_norm": 6.6582942751110705, + "kl": 0.5400390625, + "learning_rate": 3.834738617200675e-07, + "loss": 0.0005, + "reward": 3.124674081802368, + "reward_std": 0.617139033973217, + "rewards/final_reward": 1.416433364585445, + "rewards/mask_iou_reward": 0.7082166822927225, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.249674141407013, + "rewards/thk_ans_format_reward": 0.9375, + "step": 1828, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.125, + "epoch": 3.0893760539629005, + "grad_norm": 10.00986038855336, + "kl": 0.623046875, + "learning_rate": 3.831365935919055e-07, + "loss": 0.0006, + "reward": 3.3493294715881348, + "reward_std": 0.11154869198799133, + "rewards/final_reward": 1.6091478659368443, + "rewards/mask_iou_reward": 0.8045739329684222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3493293523788452, + "rewards/thk_ans_format_reward": 1.0, + "step": 1829, + "think_completion_length": 39.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.734375, + "epoch": 3.09106239460371, + "grad_norm": 8.046581936185516, + "kl": 0.658203125, + "learning_rate": 3.8279932546374366e-07, + "loss": 0.0007, + "reward": 3.2317744493484497, + "reward_std": 0.11304565519094467, + "rewards/final_reward": 1.505587991593928, + "rewards/mask_iou_reward": 0.752793995796964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2317743301391602, + "rewards/thk_ans_format_reward": 1.0, + "step": 1830, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 3.0927487352445193, + "grad_norm": 26.068413271225808, + "kl": 0.5859375, + "learning_rate": 3.824620573355818e-07, + "loss": 0.0006, + "reward": 3.3999075889587402, + "reward_std": 0.0778821213170886, + "rewards/final_reward": 1.7049617555467194, + "rewards/mask_iou_reward": 0.8524808777733597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.399907648563385, + "rewards/thk_ans_format_reward": 1.0, + "step": 1831, + "think_completion_length": 44.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.140625, + "epoch": 3.094435075885329, + "grad_norm": 5.737550620244319, + "kl": 0.578125, + "learning_rate": 3.821247892074199e-07, + "loss": 0.0006, + "reward": 3.555485725402832, + "reward_std": 0.19017744529992342, + "rewards/final_reward": 1.50405068886782, + "rewards/mask_iou_reward": 0.75202534443391, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.555485486984253, + "rewards/thk_ans_format_reward": 1.0, + "step": 1832, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.796875, + "epoch": 3.096121416526138, + "grad_norm": 9.613502793709598, + "kl": 0.5234375, + "learning_rate": 3.8178752107925797e-07, + "loss": 0.0005, + "reward": 2.7982553243637085, + "reward_std": 0.2548002079129219, + "rewards/final_reward": 0.7696277788214507, + "rewards/mask_iou_reward": 0.38481388941072536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7982552647590637, + "rewards/thk_ans_format_reward": 1.0, + "step": 1833, + "think_completion_length": 46.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.84375, + "epoch": 3.097807757166948, + "grad_norm": 8.490748952629394, + "kl": 0.61328125, + "learning_rate": 3.814502529510961e-07, + "loss": 0.0006, + "reward": 3.291377305984497, + "reward_std": 0.3434144649654627, + "rewards/final_reward": 1.1587527805696234, + "rewards/mask_iou_reward": 0.5793763902848117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2913771867752075, + "rewards/thk_ans_format_reward": 1.0, + "step": 1834, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.90625, + "epoch": 3.099494097807757, + "grad_norm": 4.401463888525623, + "kl": 0.525390625, + "learning_rate": 3.811129848229342e-07, + "loss": 0.0005, + "reward": 3.6651761531829834, + "reward_std": 0.18362296093255281, + "rewards/final_reward": 1.6550658545344978, + "rewards/mask_iou_reward": 0.8275329272672489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.665175974369049, + "rewards/thk_ans_format_reward": 1.0, + "step": 1835, + "think_completion_length": 46.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.15625, + "epoch": 3.1011804384485666, + "grad_norm": 4.078236651295097, + "kl": 0.5859375, + "learning_rate": 3.8077571669477234e-07, + "loss": 0.0006, + "reward": 3.1922521591186523, + "reward_std": 0.005031302338466048, + "rewards/final_reward": 1.475982413029449, + "rewards/mask_iou_reward": 0.7379912065147245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1922521591186523, + "rewards/thk_ans_format_reward": 1.0, + "step": 1836, + "think_completion_length": 49.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6875, + "epoch": 3.1028667790893762, + "grad_norm": 7.627962147529004, + "kl": 0.5595703125, + "learning_rate": 3.8043844856661043e-07, + "loss": 0.0006, + "reward": 3.6415493488311768, + "reward_std": 0.1579625979065895, + "rewards/final_reward": 1.9423843051787808, + "rewards/mask_iou_reward": 0.9711921525893904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6415494084358215, + "rewards/thk_ans_format_reward": 1.0, + "step": 1837, + "think_completion_length": 45.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.625, + "epoch": 3.1045531197301854, + "grad_norm": 18.076689222268563, + "kl": 0.609375, + "learning_rate": 3.8010118043844857e-07, + "loss": 0.0006, + "reward": 3.604332685470581, + "reward_std": 0.03623810596764088, + "rewards/final_reward": 1.356653644344096, + "rewards/mask_iou_reward": 0.678326822172048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6043327450752258, + "rewards/thk_ans_format_reward": 1.0, + "step": 1838, + "think_completion_length": 50.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 3.106239460370995, + "grad_norm": 60.27899415767619, + "kl": 0.697265625, + "learning_rate": 3.7976391231028665e-07, + "loss": 0.0007, + "reward": 3.1235082149505615, + "reward_std": 0.017116380273364484, + "rewards/final_reward": 0.9495803673395722, + "rewards/mask_iou_reward": 0.4747901836697861, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1235082745552063, + "rewards/thk_ans_format_reward": 1.0, + "step": 1839, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.875, + "epoch": 3.1079258010118043, + "grad_norm": 10.828374197849795, + "kl": 0.4296875, + "learning_rate": 3.794266441821248e-07, + "loss": 0.0004, + "reward": 3.080757737159729, + "reward_std": 0.42562781274318695, + "rewards/final_reward": 1.7166104285447097, + "rewards/mask_iou_reward": 0.8583052142723548, + "rewards/sam_format_reward": 0.875, + "rewards/sam_reward_func_ultra": 1.3307577967643738, + "rewards/thk_ans_format_reward": 0.875, + "step": 1840, + "think_completion_length": 45.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.671875, + "epoch": 3.109612141652614, + "grad_norm": 29.597362909569423, + "kl": 0.7265625, + "learning_rate": 3.790893760539629e-07, + "loss": 0.0005, + "reward": 3.8099430799484253, + "reward_std": 0.01841105322819203, + "rewards/final_reward": 1.9609085731482856, + "rewards/mask_iou_reward": 0.9804542865741428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8099430799484253, + "rewards/thk_ans_format_reward": 1.0, + "step": 1841, + "think_completion_length": 48.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0, + "epoch": 3.111298482293423, + "grad_norm": 6.143032352992298, + "kl": 0.64453125, + "learning_rate": 3.7875210792580097e-07, + "loss": 0.0007, + "reward": 3.4265583753585815, + "reward_std": 0.020277044735848904, + "rewards/final_reward": 1.4126127862405058, + "rewards/mask_iou_reward": 0.7063063931202529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4265583157539368, + "rewards/thk_ans_format_reward": 1.0, + "step": 1842, + "think_completion_length": 51.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.78125, + "epoch": 3.1129848229342327, + "grad_norm": 9.920229412362959, + "kl": 0.521484375, + "learning_rate": 3.784148397976391e-07, + "loss": 0.0005, + "reward": 3.4824771881103516, + "reward_std": 0.18812450766563416, + "rewards/final_reward": 1.4282994211107694, + "rewards/mask_iou_reward": 0.7141497105553847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4824773669242859, + "rewards/thk_ans_format_reward": 1.0, + "step": 1843, + "think_completion_length": 45.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.125, + "epoch": 3.1146711635750424, + "grad_norm": 5.468730327469074, + "kl": 0.568359375, + "learning_rate": 3.7807757166947725e-07, + "loss": 0.0006, + "reward": 3.3906657695770264, + "reward_std": 0.44338157773017883, + "rewards/final_reward": 1.1189098370969175, + "rewards/mask_iou_reward": 0.5594549185484587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3906657695770264, + "rewards/thk_ans_format_reward": 1.0, + "step": 1844, + "think_completion_length": 40.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.59375, + "epoch": 3.1163575042158516, + "grad_norm": 4.671244035097516, + "kl": 0.587890625, + "learning_rate": 3.777403035413153e-07, + "loss": 0.0006, + "reward": 3.467657446861267, + "reward_std": 0.14314634166657925, + "rewards/final_reward": 1.4949584161797376, + "rewards/mask_iou_reward": 0.7474792080898688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4676575064659119, + "rewards/thk_ans_format_reward": 1.0, + "step": 1845, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.984375, + "epoch": 3.118043844856661, + "grad_norm": 9.86402742741037, + "kl": 0.58203125, + "learning_rate": 3.774030354131534e-07, + "loss": 0.0006, + "reward": 3.4636902809143066, + "reward_std": 0.39987847208976746, + "rewards/final_reward": 1.5898304441749473, + "rewards/mask_iou_reward": 0.7949152220874737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4636903405189514, + "rewards/thk_ans_format_reward": 1.0, + "step": 1846, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.84375, + "epoch": 3.1197301854974704, + "grad_norm": 9.527701121732772, + "kl": 0.642578125, + "learning_rate": 3.7706576728499157e-07, + "loss": 0.0006, + "reward": 2.6779627799987793, + "reward_std": 0.21313096582889557, + "rewards/final_reward": 0.5526819332576769, + "rewards/mask_iou_reward": 0.27634096662883845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6779626905918121, + "rewards/thk_ans_format_reward": 1.0, + "step": 1847, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.3125, + "epoch": 3.12141652613828, + "grad_norm": 17.793087006055625, + "kl": 0.5703125, + "learning_rate": 3.7672849915682965e-07, + "loss": 0.0006, + "reward": 3.570667028427124, + "reward_std": 0.1760418675839901, + "rewards/final_reward": 1.8226773166599215, + "rewards/mask_iou_reward": 0.9113386583299607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5706669092178345, + "rewards/thk_ans_format_reward": 1.0, + "step": 1848, + "think_completion_length": 51.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.234375, + "epoch": 3.123102866779089, + "grad_norm": 6.947591571261172, + "kl": 0.58984375, + "learning_rate": 3.7639123102866774e-07, + "loss": 0.0006, + "reward": 3.3724918365478516, + "reward_std": 0.1368257123976946, + "rewards/final_reward": 1.3695020442669015, + "rewards/mask_iou_reward": 0.6847510221334507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3724918365478516, + "rewards/thk_ans_format_reward": 1.0, + "step": 1849, + "think_completion_length": 47.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.984375, + "epoch": 3.124789207419899, + "grad_norm": 9.58505081019304, + "kl": 0.7265625, + "learning_rate": 3.760539629005059e-07, + "loss": 0.0007, + "reward": 3.7754982709884644, + "reward_std": 0.03654424054548144, + "rewards/final_reward": 1.6380196415780146, + "rewards/mask_iou_reward": 0.8190098207890073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7754983305931091, + "rewards/thk_ans_format_reward": 1.0, + "step": 1850, + "think_completion_length": 40.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5625, + "epoch": 3.126475548060708, + "grad_norm": 5.781252685852134, + "kl": 0.5390625, + "learning_rate": 3.75716694772344e-07, + "loss": 0.0005, + "reward": 3.590414047241211, + "reward_std": 0.09510018303990364, + "rewards/final_reward": 1.7070859122080297, + "rewards/mask_iou_reward": 0.8535429561040149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.590414047241211, + "rewards/thk_ans_format_reward": 1.0, + "step": 1851, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.421875, + "epoch": 3.1281618887015177, + "grad_norm": 11.536011446857135, + "kl": 0.5126953125, + "learning_rate": 3.753794266441821e-07, + "loss": 0.0005, + "reward": 3.3813726902008057, + "reward_std": 0.19809474796056747, + "rewards/final_reward": 0.9778288256741281, + "rewards/mask_iou_reward": 0.48891441283706405, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3813725113868713, + "rewards/thk_ans_format_reward": 1.0, + "step": 1852, + "think_completion_length": 39.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.390625, + "epoch": 3.1298482293423273, + "grad_norm": 10.820383708545043, + "kl": 0.62109375, + "learning_rate": 3.7504215851602025e-07, + "loss": 0.0006, + "reward": 3.6570252180099487, + "reward_std": 0.21269061416387558, + "rewards/final_reward": 1.6539371430881022, + "rewards/mask_iou_reward": 0.8269685715440511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6570250988006592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1853, + "think_completion_length": 48.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.109375, + "epoch": 3.1315345699831365, + "grad_norm": 170.8577386926615, + "kl": 0.705078125, + "learning_rate": 3.7470489038785834e-07, + "loss": 0.0007, + "reward": 3.6504067182540894, + "reward_std": 0.07961778342723846, + "rewards/final_reward": 1.6228600793158108, + "rewards/mask_iou_reward": 0.8114300396579054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6504066586494446, + "rewards/thk_ans_format_reward": 1.0, + "step": 1854, + "think_completion_length": 47.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.484375, + "epoch": 3.133220910623946, + "grad_norm": 20.313002252551094, + "kl": 0.580078125, + "learning_rate": 3.743676222596964e-07, + "loss": 0.0006, + "reward": 3.3023815155029297, + "reward_std": 0.1649649254977703, + "rewards/final_reward": 1.058155867642932, + "rewards/mask_iou_reward": 0.529077933821466, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3023815155029297, + "rewards/thk_ans_format_reward": 1.0, + "step": 1855, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.25, + "epoch": 3.1349072512647553, + "grad_norm": 5.257598384804945, + "kl": 0.4658203125, + "learning_rate": 3.7403035413153456e-07, + "loss": 0.0005, + "reward": 3.3379251956939697, + "reward_std": 0.20651425421237946, + "rewards/final_reward": 1.2870029569744763, + "rewards/mask_iou_reward": 0.6435014784872382, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.337925374507904, + "rewards/thk_ans_format_reward": 1.0, + "step": 1856, + "think_completion_length": 45.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.46875, + "epoch": 3.136593591905565, + "grad_norm": 10.768937765332302, + "kl": 0.474609375, + "learning_rate": 3.736930860033727e-07, + "loss": 0.0005, + "reward": 3.6509578227996826, + "reward_std": 0.31961746513843536, + "rewards/final_reward": 1.6596256594304695, + "rewards/mask_iou_reward": 0.8298128297152347, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.6665828227996826, + "rewards/thk_ans_format_reward": 1.0, + "step": 1857, + "think_completion_length": 46.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.6875, + "epoch": 3.138279932546374, + "grad_norm": 9.965618795679344, + "kl": 0.544921875, + "learning_rate": 3.7335581787521074e-07, + "loss": 0.0005, + "reward": 3.0416864156723022, + "reward_std": 0.19793753325939178, + "rewards/final_reward": 1.1762378400742732, + "rewards/mask_iou_reward": 0.5881189200371366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0416864231228828, + "rewards/thk_ans_format_reward": 1.0, + "step": 1858, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.6875, + "epoch": 3.139966273187184, + "grad_norm": 8.233677040831926, + "kl": 0.5703125, + "learning_rate": 3.730185497470489e-07, + "loss": 0.0006, + "reward": 3.6706053018569946, + "reward_std": 0.10431353002786636, + "rewards/final_reward": 1.8973720981450843, + "rewards/mask_iou_reward": 0.9486860490725422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6706053018569946, + "rewards/thk_ans_format_reward": 1.0, + "step": 1859, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.828125, + "epoch": 3.1416526138279934, + "grad_norm": 12.952010142511586, + "kl": 0.66796875, + "learning_rate": 3.72681281618887e-07, + "loss": 0.0007, + "reward": 3.4579014778137207, + "reward_std": 0.23089369386434555, + "rewards/final_reward": 1.597618762950591, + "rewards/mask_iou_reward": 0.7988093814752955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4579015374183655, + "rewards/thk_ans_format_reward": 1.0, + "step": 1860, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.640625, + "epoch": 3.1433389544688026, + "grad_norm": 9.221455733853075, + "kl": 0.583984375, + "learning_rate": 3.723440134907251e-07, + "loss": 0.0006, + "reward": 3.3412728309631348, + "reward_std": 0.387121319770813, + "rewards/final_reward": 1.2192848265220455, + "rewards/mask_iou_reward": 0.6096424132610228, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3725228309631348, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1861, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.859375, + "epoch": 3.1450252951096123, + "grad_norm": 10.246420327009988, + "kl": 0.5166015625, + "learning_rate": 3.720067453625632e-07, + "loss": 0.0005, + "reward": 3.467332124710083, + "reward_std": 0.11898842453956604, + "rewards/final_reward": 1.3554770121678938, + "rewards/mask_iou_reward": 0.6777385060839469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4673320055007935, + "rewards/thk_ans_format_reward": 1.0, + "step": 1862, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.703125, + "epoch": 3.1467116357504215, + "grad_norm": 4.853357004136349, + "kl": 0.54296875, + "learning_rate": 3.7166947723440133e-07, + "loss": 0.0006, + "reward": 3.306196928024292, + "reward_std": 0.26819103956222534, + "rewards/final_reward": 1.0140351818526023, + "rewards/mask_iou_reward": 0.5070175909263012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3061969578266144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1863, + "think_completion_length": 47.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6875, + "epoch": 3.148397976391231, + "grad_norm": 15.97786230847862, + "kl": 0.814453125, + "learning_rate": 3.713322091062395e-07, + "loss": 0.0008, + "reward": 3.4921233654022217, + "reward_std": 0.1628289446234703, + "rewards/final_reward": 1.8660033928082365, + "rewards/mask_iou_reward": 0.9330016964041182, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4921232461929321, + "rewards/thk_ans_format_reward": 1.0, + "step": 1864, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.28125, + "epoch": 3.1500843170320403, + "grad_norm": 8.047713030230051, + "kl": 0.609375, + "learning_rate": 3.7099494097807756e-07, + "loss": 0.0006, + "reward": 3.2138736248016357, + "reward_std": 0.040225003845989704, + "rewards/final_reward": 0.9726650117579593, + "rewards/mask_iou_reward": 0.48633250587897964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2138735353946686, + "rewards/thk_ans_format_reward": 1.0, + "step": 1865, + "think_completion_length": 43.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.09375, + "epoch": 3.15177065767285, + "grad_norm": 16.52763285766392, + "kl": 0.62109375, + "learning_rate": 3.7065767284991565e-07, + "loss": 0.0006, + "reward": 3.612211227416992, + "reward_std": 0.21964190807193518, + "rewards/final_reward": 1.6254546480444538, + "rewards/mask_iou_reward": 0.8127273240222269, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6122112274169922, + "rewards/thk_ans_format_reward": 1.0, + "step": 1866, + "think_completion_length": 43.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.71875, + "epoch": 3.1534569983136596, + "grad_norm": 4.863586737187257, + "kl": 0.91015625, + "learning_rate": 3.703204047217538e-07, + "loss": 0.0009, + "reward": 3.467667579650879, + "reward_std": 0.1343383565545082, + "rewards/final_reward": 1.3809162033320566, + "rewards/mask_iou_reward": 0.6904581016660283, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.467667579650879, + "rewards/thk_ans_format_reward": 1.0, + "step": 1867, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.46875, + "epoch": 3.1551433389544687, + "grad_norm": 8.785152209092196, + "kl": 0.53515625, + "learning_rate": 3.699831365935919e-07, + "loss": 0.0005, + "reward": 3.3410245180130005, + "reward_std": 0.19761168956756592, + "rewards/final_reward": 1.7634354339104432, + "rewards/mask_iou_reward": 0.8817177169552216, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3410245180130005, + "rewards/thk_ans_format_reward": 1.0, + "step": 1868, + "think_completion_length": 40.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5625, + "epoch": 3.1568296795952784, + "grad_norm": 19.595912000538206, + "kl": 0.587890625, + "learning_rate": 3.6964586846543e-07, + "loss": 0.0006, + "reward": 2.9621787071228027, + "reward_std": 0.1222074730321765, + "rewards/final_reward": 0.841697455205978, + "rewards/mask_iou_reward": 0.420848727602989, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9621787965297699, + "rewards/thk_ans_format_reward": 1.0, + "step": 1869, + "think_completion_length": 49.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.859375, + "epoch": 3.1585160202360876, + "grad_norm": 9.419372854331563, + "kl": 0.578125, + "learning_rate": 3.693086003372681e-07, + "loss": 0.0006, + "reward": 3.422818660736084, + "reward_std": 0.2237471342086792, + "rewards/final_reward": 1.283719321853805, + "rewards/mask_iou_reward": 0.6418596609269025, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4228186011314392, + "rewards/thk_ans_format_reward": 1.0, + "step": 1870, + "think_completion_length": 48.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.4375, + "epoch": 3.160202360876897, + "grad_norm": 7.825248544089704, + "kl": 0.86328125, + "learning_rate": 3.689713322091062e-07, + "loss": 0.0009, + "reward": 3.4461352825164795, + "reward_std": 0.1912602037191391, + "rewards/final_reward": 1.5204806993205606, + "rewards/mask_iou_reward": 0.7602403496602803, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.44613516330719, + "rewards/thk_ans_format_reward": 1.0, + "step": 1871, + "think_completion_length": 49.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.90625, + "epoch": 3.1618887015177064, + "grad_norm": 6.866246401018885, + "kl": 0.5703125, + "learning_rate": 3.6863406408094433e-07, + "loss": 0.0006, + "reward": 2.7474160194396973, + "reward_std": 0.17037902772426605, + "rewards/final_reward": 0.9670391487885153, + "rewards/mask_iou_reward": 0.48351957439425763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7474161013960838, + "rewards/thk_ans_format_reward": 1.0, + "step": 1872, + "think_completion_length": 43.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.84375, + "epoch": 3.163575042158516, + "grad_norm": 14.437614062731507, + "kl": 0.552734375, + "learning_rate": 3.6829679595278247e-07, + "loss": 0.0005, + "reward": 3.483241558074951, + "reward_std": 0.3002520129084587, + "rewards/final_reward": 1.7079844480126627, + "rewards/mask_iou_reward": 0.8539922240063313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.483241617679596, + "rewards/thk_ans_format_reward": 1.0, + "step": 1873, + "think_completion_length": 49.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.53125, + "epoch": 3.1652613827993257, + "grad_norm": 7.261877769797301, + "kl": 0.6015625, + "learning_rate": 3.679595278246205e-07, + "loss": 0.0006, + "reward": 3.2580798864364624, + "reward_std": 0.11684287153184414, + "rewards/final_reward": 1.677791107797584, + "rewards/mask_iou_reward": 0.838895553898792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2580798864364624, + "rewards/thk_ans_format_reward": 1.0, + "step": 1874, + "think_completion_length": 43.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.859375, + "epoch": 3.166947723440135, + "grad_norm": 6.984406291862621, + "kl": 0.666015625, + "learning_rate": 3.6762225969645865e-07, + "loss": 0.0007, + "reward": 3.373793601989746, + "reward_std": 0.1073999097570777, + "rewards/final_reward": 1.5053395216827492, + "rewards/mask_iou_reward": 0.7526697608413746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3737936615943909, + "rewards/thk_ans_format_reward": 1.0, + "step": 1875, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.265625, + "epoch": 3.1686340640809445, + "grad_norm": 12.918770229866686, + "kl": 0.5078125, + "learning_rate": 3.672849915682968e-07, + "loss": 0.0005, + "reward": 2.6261537075042725, + "reward_std": 0.35085177421569824, + "rewards/final_reward": 0.20398478661733804, + "rewards/mask_iou_reward": 0.10199239330866902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6261536031961441, + "rewards/thk_ans_format_reward": 1.0, + "step": 1876, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.390625, + "epoch": 3.1703204047217537, + "grad_norm": 7.044151304726128, + "kl": 0.5234375, + "learning_rate": 3.6694772344013493e-07, + "loss": 0.0005, + "reward": 3.3403379917144775, + "reward_std": 0.09493143483996391, + "rewards/final_reward": 1.0117943134491862, + "rewards/mask_iou_reward": 0.5058971567245931, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3403378129005432, + "rewards/thk_ans_format_reward": 1.0, + "step": 1877, + "think_completion_length": 44.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.21875, + "epoch": 3.1720067453625633, + "grad_norm": 7.004275452012231, + "kl": 0.5185546875, + "learning_rate": 3.6661045531197296e-07, + "loss": 0.0004, + "reward": 3.5764840841293335, + "reward_std": 0.18109191954135895, + "rewards/final_reward": 1.7178301548804966, + "rewards/mask_iou_reward": 0.8589150774402483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5764840841293335, + "rewards/thk_ans_format_reward": 1.0, + "step": 1878, + "think_completion_length": 49.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 3.1736930860033725, + "grad_norm": 7.125834124063926, + "kl": 0.609375, + "learning_rate": 3.662731871838111e-07, + "loss": 0.0006, + "reward": 3.4124138355255127, + "reward_std": 0.24705388210713863, + "rewards/final_reward": 0.9846058681887829, + "rewards/mask_iou_reward": 0.49230293409439146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4124139547348022, + "rewards/thk_ans_format_reward": 1.0, + "step": 1879, + "think_completion_length": 49.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.84375, + "epoch": 3.175379426644182, + "grad_norm": 9.54853842110885, + "kl": 0.69140625, + "learning_rate": 3.6593591905564924e-07, + "loss": 0.0007, + "reward": 3.36881947517395, + "reward_std": 0.2934834212064743, + "rewards/final_reward": 1.6325663245336783, + "rewards/mask_iou_reward": 0.8162831622668392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3688194751739502, + "rewards/thk_ans_format_reward": 1.0, + "step": 1880, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.40625, + "epoch": 3.177065767284992, + "grad_norm": 16.811266466139205, + "kl": 0.58203125, + "learning_rate": 3.6559865092748733e-07, + "loss": 0.0006, + "reward": 3.3678882122039795, + "reward_std": 0.1498733222251758, + "rewards/final_reward": 1.533214750984277, + "rewards/mask_iou_reward": 0.7666073754921385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.367888331413269, + "rewards/thk_ans_format_reward": 1.0, + "step": 1881, + "think_completion_length": 48.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.203125, + "epoch": 3.178752107925801, + "grad_norm": 20.32912750334905, + "kl": 0.609375, + "learning_rate": 3.6526138279932547e-07, + "loss": 0.0006, + "reward": 3.725816488265991, + "reward_std": 0.07194982096552849, + "rewards/final_reward": 1.869411332281031, + "rewards/mask_iou_reward": 0.9347056661405155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7258164882659912, + "rewards/thk_ans_format_reward": 1.0, + "step": 1882, + "think_completion_length": 42.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.078125, + "epoch": 3.1804384485666106, + "grad_norm": 13.022071736423868, + "kl": 0.560546875, + "learning_rate": 3.6492411467116356e-07, + "loss": 0.0006, + "reward": 3.3680756092071533, + "reward_std": 0.164290108717978, + "rewards/final_reward": 1.1868072526412312, + "rewards/mask_iou_reward": 0.5934036263206156, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.368075668811798, + "rewards/thk_ans_format_reward": 1.0, + "step": 1883, + "think_completion_length": 47.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.390625, + "epoch": 3.18212478920742, + "grad_norm": 17.883421271785252, + "kl": 0.56640625, + "learning_rate": 3.6458684654300165e-07, + "loss": 0.0006, + "reward": 3.207452654838562, + "reward_std": 0.4637444317340851, + "rewards/final_reward": 1.2528438943904214, + "rewards/mask_iou_reward": 0.6264219471952107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2074525952339172, + "rewards/thk_ans_format_reward": 1.0, + "step": 1884, + "think_completion_length": 51.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.796875, + "epoch": 3.1838111298482294, + "grad_norm": 15.091080800093014, + "kl": 0.578125, + "learning_rate": 3.642495784148398e-07, + "loss": 0.0006, + "reward": 2.940601348876953, + "reward_std": 0.13293109834194183, + "rewards/final_reward": 0.568362582486542, + "rewards/mask_iou_reward": 0.284181291243271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9406013190746307, + "rewards/thk_ans_format_reward": 1.0, + "step": 1885, + "think_completion_length": 49.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.46875, + "epoch": 3.1854974704890386, + "grad_norm": 7.057374034683764, + "kl": 0.541015625, + "learning_rate": 3.6391231028667793e-07, + "loss": 0.0005, + "reward": 3.1076853275299072, + "reward_std": 0.07365784235298634, + "rewards/final_reward": 1.326237704850417, + "rewards/mask_iou_reward": 0.6631188524252085, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1076853275299072, + "rewards/thk_ans_format_reward": 1.0, + "step": 1886, + "think_completion_length": 45.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 3.1871838111298483, + "grad_norm": 10.782989362018533, + "kl": 0.5703125, + "learning_rate": 3.6357504215851596e-07, + "loss": 0.0006, + "reward": 3.658339738845825, + "reward_std": 0.015222079586237669, + "rewards/final_reward": 1.7016695886711926, + "rewards/mask_iou_reward": 0.8508347943355963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6583398580551147, + "rewards/thk_ans_format_reward": 1.0, + "step": 1887, + "think_completion_length": 47.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.734375, + "epoch": 3.1888701517706575, + "grad_norm": 16.208059848588775, + "kl": 0.884765625, + "learning_rate": 3.632377740303541e-07, + "loss": 0.0009, + "reward": 3.001835823059082, + "reward_std": 0.17691810801625252, + "rewards/final_reward": 0.9908187527331923, + "rewards/mask_iou_reward": 0.49540937636659615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0018357634544373, + "rewards/thk_ans_format_reward": 1.0, + "step": 1888, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.8125, + "epoch": 3.190556492411467, + "grad_norm": 8.666552629160558, + "kl": 0.544921875, + "learning_rate": 3.6290050590219224e-07, + "loss": 0.0005, + "reward": 3.2973833084106445, + "reward_std": 0.20763015747070312, + "rewards/final_reward": 1.6951685637802774, + "rewards/mask_iou_reward": 0.8475842818901387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2973833084106445, + "rewards/thk_ans_format_reward": 1.0, + "step": 1889, + "think_completion_length": 50.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.125, + "epoch": 3.1922428330522767, + "grad_norm": 8.026625405899852, + "kl": 0.5546875, + "learning_rate": 3.625632377740304e-07, + "loss": 0.0006, + "reward": 3.849402904510498, + "reward_std": 0.04095839988440275, + "rewards/final_reward": 1.8241999084811233, + "rewards/mask_iou_reward": 0.9120999542405617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.849402904510498, + "rewards/thk_ans_format_reward": 1.0, + "step": 1890, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.34375, + "epoch": 3.193929173693086, + "grad_norm": 12.34817957993866, + "kl": 0.6015625, + "learning_rate": 3.622259696458684e-07, + "loss": 0.0006, + "reward": 3.621413826942444, + "reward_std": 0.04138875612989068, + "rewards/final_reward": 1.8486262133262046, + "rewards/mask_iou_reward": 0.9243131066631023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6214138269424438, + "rewards/thk_ans_format_reward": 1.0, + "step": 1891, + "think_completion_length": 47.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.765625, + "epoch": 3.1956155143338956, + "grad_norm": 23.116554236485523, + "kl": 0.572265625, + "learning_rate": 3.6188870151770656e-07, + "loss": 0.0006, + "reward": 3.0774978399276733, + "reward_std": 0.01694110711105168, + "rewards/final_reward": 0.7942402422055221, + "rewards/mask_iou_reward": 0.39712012110276107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0774978697299957, + "rewards/thk_ans_format_reward": 1.0, + "step": 1892, + "think_completion_length": 46.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.921875, + "epoch": 3.1973018549747048, + "grad_norm": 46.81499159655645, + "kl": 0.625, + "learning_rate": 3.615514333895447e-07, + "loss": 0.0006, + "reward": 3.678188443183899, + "reward_std": 0.11049404554069042, + "rewards/final_reward": 1.4534138323408698, + "rewards/mask_iou_reward": 0.7267069161704349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6781885027885437, + "rewards/thk_ans_format_reward": 1.0, + "step": 1893, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.65625, + "epoch": 3.1989881956155144, + "grad_norm": 7.208762040832486, + "kl": 0.650390625, + "learning_rate": 3.612141652613828e-07, + "loss": 0.0007, + "reward": 3.148299217224121, + "reward_std": 0.14013096690177917, + "rewards/final_reward": 1.7720664262952694, + "rewards/mask_iou_reward": 0.8860332131476347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1482991874217987, + "rewards/thk_ans_format_reward": 1.0, + "step": 1894, + "think_completion_length": 51.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.59375, + "epoch": 3.2006745362563236, + "grad_norm": 20.349059929554834, + "kl": 0.701171875, + "learning_rate": 3.6087689713322087e-07, + "loss": 0.0007, + "reward": 3.4096421003341675, + "reward_std": 0.020920042879879475, + "rewards/final_reward": 1.6369392328477477, + "rewards/mask_iou_reward": 0.8184696164238738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4096421599388123, + "rewards/thk_ans_format_reward": 1.0, + "step": 1895, + "think_completion_length": 51.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.734375, + "epoch": 3.2023608768971332, + "grad_norm": 12.582295871610476, + "kl": 0.4169921875, + "learning_rate": 3.60539629005059e-07, + "loss": 0.0004, + "reward": 2.980480432510376, + "reward_std": 0.2008163258433342, + "rewards/final_reward": 1.269321376141052, + "rewards/mask_iou_reward": 0.634660688070526, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.0117304623126984, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1896, + "think_completion_length": 52.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.765625, + "epoch": 3.204047217537943, + "grad_norm": 9.450852600185378, + "kl": 0.5546875, + "learning_rate": 3.602023608768971e-07, + "loss": 0.0005, + "reward": 3.686271071434021, + "reward_std": 0.06225780211389065, + "rewards/final_reward": 1.4150984738944112, + "rewards/mask_iou_reward": 0.7075492369472056, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6862711906433105, + "rewards/thk_ans_format_reward": 1.0, + "step": 1897, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.140625, + "epoch": 3.205733558178752, + "grad_norm": 24.499457257777063, + "kl": 0.74609375, + "learning_rate": 3.5986509274873524e-07, + "loss": 0.0008, + "reward": 3.770930767059326, + "reward_std": 0.036563062109053135, + "rewards/final_reward": 1.723879682408082, + "rewards/mask_iou_reward": 0.861939841204041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.770930826663971, + "rewards/thk_ans_format_reward": 1.0, + "step": 1898, + "think_completion_length": 46.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.359375, + "epoch": 3.2074198988195617, + "grad_norm": 25.75726985246122, + "kl": 0.50390625, + "learning_rate": 3.5952782462057333e-07, + "loss": 0.0005, + "reward": 3.2525272369384766, + "reward_std": 0.3233413156121969, + "rewards/final_reward": 1.0621304009366679, + "rewards/mask_iou_reward": 0.5310652004683339, + "rewards/sam_format_reward": 0.921875, + "rewards/sam_reward_func_ultra": 1.4087771773338318, + "rewards/thk_ans_format_reward": 0.921875, + "step": 1899, + "think_completion_length": 47.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.859375, + "epoch": 3.209106239460371, + "grad_norm": 7.243676204871831, + "kl": 0.525390625, + "learning_rate": 3.5919055649241147e-07, + "loss": 0.0005, + "reward": 3.7162941694259644, + "reward_std": 0.11420441046357155, + "rewards/final_reward": 1.7875524891500199, + "rewards/mask_iou_reward": 0.8937762445750099, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7162942290306091, + "rewards/thk_ans_format_reward": 1.0, + "step": 1900, + "think_completion_length": 48.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.4375, + "epoch": 3.2107925801011805, + "grad_norm": 15.670700195164919, + "kl": 0.64453125, + "learning_rate": 3.5885328836424955e-07, + "loss": 0.0006, + "reward": 2.7376617193222046, + "reward_std": 0.0967277530580759, + "rewards/final_reward": 0.6101376436250409, + "rewards/mask_iou_reward": 0.30506882181252043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7376617193222046, + "rewards/thk_ans_format_reward": 1.0, + "step": 1901, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.203125, + "epoch": 3.2124789207419897, + "grad_norm": 9.917814521267253, + "kl": 0.828125, + "learning_rate": 3.585160202360877e-07, + "loss": 0.0008, + "reward": 3.349041223526001, + "reward_std": 0.11464390531182289, + "rewards/final_reward": 1.6008897912950604, + "rewards/mask_iou_reward": 0.8004448956475302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.349041372537613, + "rewards/thk_ans_format_reward": 1.0, + "step": 1902, + "think_completion_length": 47.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.0625, + "epoch": 3.2141652613827993, + "grad_norm": 13.716296122192867, + "kl": 0.482421875, + "learning_rate": 3.581787521079258e-07, + "loss": 0.0005, + "reward": 3.7791026830673218, + "reward_std": 0.16377420909702778, + "rewards/final_reward": 1.7674922340875896, + "rewards/mask_iou_reward": 0.8837461170437948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7791024446487427, + "rewards/thk_ans_format_reward": 1.0, + "step": 1903, + "think_completion_length": 47.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.265625, + "epoch": 3.2158516020236085, + "grad_norm": 6.917382991288502, + "kl": 0.4609375, + "learning_rate": 3.5784148397976387e-07, + "loss": 0.0005, + "reward": 3.2332409620285034, + "reward_std": 0.11346263438463211, + "rewards/final_reward": 1.6959626416145304, + "rewards/mask_iou_reward": 0.8479813208072652, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2332409620285034, + "rewards/thk_ans_format_reward": 1.0, + "step": 1904, + "think_completion_length": 39.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.40625, + "epoch": 3.217537942664418, + "grad_norm": 6.471546535689695, + "kl": 0.4716796875, + "learning_rate": 3.57504215851602e-07, + "loss": 0.0005, + "reward": 3.0267739295959473, + "reward_std": 0.02534060279140249, + "rewards/final_reward": 1.7786319183385815, + "rewards/mask_iou_reward": 0.8893159591692907, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.026773989200592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1905, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.953125, + "epoch": 3.219224283305228, + "grad_norm": 10.204264759824875, + "kl": 0.5068359375, + "learning_rate": 3.5716694772344015e-07, + "loss": 0.0005, + "reward": 2.850593686103821, + "reward_std": 0.19878476485610008, + "rewards/final_reward": 1.4631749075644072, + "rewards/mask_iou_reward": 0.7315874537822036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8505937159061432, + "rewards/thk_ans_format_reward": 1.0, + "step": 1906, + "think_completion_length": 48.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 3.220910623946037, + "grad_norm": 4.331052028511863, + "kl": 0.60546875, + "learning_rate": 3.568296795952782e-07, + "loss": 0.0006, + "reward": 2.796687960624695, + "reward_std": 0.14800949243362993, + "rewards/final_reward": 0.0, + "rewards/mask_iou_reward": 0.0, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7966879308223724, + "rewards/thk_ans_format_reward": 1.0, + "step": 1907, + "think_completion_length": 53.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5625, + "epoch": 3.2225969645868466, + "grad_norm": 12.014305801190648, + "kl": 0.5625, + "learning_rate": 3.564924114671163e-07, + "loss": 0.0006, + "reward": 3.552351474761963, + "reward_std": 0.1006348617374897, + "rewards/final_reward": 1.178661954997034, + "rewards/mask_iou_reward": 0.589330977498517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.552351474761963, + "rewards/thk_ans_format_reward": 1.0, + "step": 1908, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.8125, + "epoch": 3.224283305227656, + "grad_norm": 21.494166836430573, + "kl": 0.5, + "learning_rate": 3.5615514333895447e-07, + "loss": 0.0005, + "reward": 3.580656051635742, + "reward_std": 0.1924985572695732, + "rewards/final_reward": 1.5555895783528935, + "rewards/mask_iou_reward": 0.7777947891764467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.580656111240387, + "rewards/thk_ans_format_reward": 1.0, + "step": 1909, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.109375, + "epoch": 3.2259696458684655, + "grad_norm": 9.042788517181418, + "kl": 0.55859375, + "learning_rate": 3.5581787521079255e-07, + "loss": 0.0006, + "reward": 3.6765564680099487, + "reward_std": 0.15247973427176476, + "rewards/final_reward": 1.6539351637884079, + "rewards/mask_iou_reward": 0.8269675818942039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6765565276145935, + "rewards/thk_ans_format_reward": 1.0, + "step": 1910, + "think_completion_length": 44.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.03125, + "epoch": 3.2276559865092747, + "grad_norm": 30.946422055169762, + "kl": 0.69140625, + "learning_rate": 3.554806070826307e-07, + "loss": 0.0007, + "reward": 3.311057448387146, + "reward_std": 0.05010443180799484, + "rewards/final_reward": 1.2626555786327738, + "rewards/mask_iou_reward": 0.6313277893163869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3110575675964355, + "rewards/thk_ans_format_reward": 1.0, + "step": 1911, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.546875, + "epoch": 3.2293423271500843, + "grad_norm": 15.95198649741166, + "kl": 0.572265625, + "learning_rate": 3.551433389544688e-07, + "loss": 0.0006, + "reward": 3.87019944190979, + "reward_std": 0.008461029967293143, + "rewards/final_reward": 1.8844051330535168, + "rewards/mask_iou_reward": 0.9422025665267584, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8701993823051453, + "rewards/thk_ans_format_reward": 1.0, + "step": 1912, + "think_completion_length": 49.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.3125, + "epoch": 3.231028667790894, + "grad_norm": 71.42783539007475, + "kl": 0.587890625, + "learning_rate": 3.548060708263069e-07, + "loss": 0.0006, + "reward": 3.5393285751342773, + "reward_std": 0.0627220245078206, + "rewards/final_reward": 1.3345203832878214, + "rewards/mask_iou_reward": 0.6672601916439107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5393285751342773, + "rewards/thk_ans_format_reward": 1.0, + "step": 1913, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.25, + "epoch": 3.232715008431703, + "grad_norm": 12.793804181066932, + "kl": 0.498046875, + "learning_rate": 3.54468802698145e-07, + "loss": 0.0005, + "reward": 2.918440103530884, + "reward_std": 0.39514149725437164, + "rewards/final_reward": 1.2474688527428803, + "rewards/mask_iou_reward": 0.6237344263714402, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.980940043926239, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1914, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.875, + "epoch": 3.2344013490725128, + "grad_norm": 12.041314585208962, + "kl": 0.4609375, + "learning_rate": 3.5413153456998315e-07, + "loss": 0.0005, + "reward": 3.235507011413574, + "reward_std": 0.36636675521731377, + "rewards/final_reward": 1.545750925604584, + "rewards/mask_iou_reward": 0.772875462802292, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.3292570114135742, + "rewards/thk_ans_format_reward": 0.953125, + "step": 1915, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.78125, + "epoch": 3.236087689713322, + "grad_norm": 6.181907315446415, + "kl": 0.58203125, + "learning_rate": 3.5379426644182124e-07, + "loss": 0.0006, + "reward": 3.6867408752441406, + "reward_std": 0.02472075680270791, + "rewards/final_reward": 1.5338553636125272, + "rewards/mask_iou_reward": 0.7669276818062636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6867409944534302, + "rewards/thk_ans_format_reward": 1.0, + "step": 1916, + "think_completion_length": 48.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.796875, + "epoch": 3.2377740303541316, + "grad_norm": 8.259989325991016, + "kl": 0.62890625, + "learning_rate": 3.534569983136593e-07, + "loss": 0.0006, + "reward": 3.3222023248672485, + "reward_std": 0.10382327809929848, + "rewards/final_reward": 0.95217661103699, + "rewards/mask_iou_reward": 0.476088305518495, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.322202205657959, + "rewards/thk_ans_format_reward": 1.0, + "step": 1917, + "think_completion_length": 43.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.703125, + "epoch": 3.2394603709949408, + "grad_norm": 5.195267131537693, + "kl": 0.51953125, + "learning_rate": 3.5311973018549746e-07, + "loss": 0.0005, + "reward": 3.7313687801361084, + "reward_std": 0.012648439034819603, + "rewards/final_reward": 1.8695688607840397, + "rewards/mask_iou_reward": 0.9347844303920199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7313688397407532, + "rewards/thk_ans_format_reward": 1.0, + "step": 1918, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.890625, + "epoch": 3.2411467116357504, + "grad_norm": 11.689240947938602, + "kl": 0.5703125, + "learning_rate": 3.527824620573356e-07, + "loss": 0.0006, + "reward": 3.478485584259033, + "reward_std": 0.030393260531127453, + "rewards/final_reward": 1.292362160744934, + "rewards/mask_iou_reward": 0.646181080372467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.478485643863678, + "rewards/thk_ans_format_reward": 1.0, + "step": 1919, + "think_completion_length": 40.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.171875, + "epoch": 3.24283305227656, + "grad_norm": 8.657308766813404, + "kl": 0.6015625, + "learning_rate": 3.5244519392917364e-07, + "loss": 0.0006, + "reward": 3.202649235725403, + "reward_std": 0.23502523079514503, + "rewards/final_reward": 1.0171185468343586, + "rewards/mask_iou_reward": 0.5085592734171793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2026492953300476, + "rewards/thk_ans_format_reward": 1.0, + "step": 1920, + "think_completion_length": 44.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.859375, + "epoch": 3.2445193929173692, + "grad_norm": 6.640724330191083, + "kl": 0.6328125, + "learning_rate": 3.521079258010118e-07, + "loss": 0.0006, + "reward": 3.8660894632339478, + "reward_std": 0.013371082721278071, + "rewards/final_reward": 1.9243815967458078, + "rewards/mask_iou_reward": 0.9621907983729039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8660894632339478, + "rewards/thk_ans_format_reward": 1.0, + "step": 1921, + "think_completion_length": 46.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.421875, + "epoch": 3.246205733558179, + "grad_norm": 15.215046527172118, + "kl": 0.810546875, + "learning_rate": 3.517706576728499e-07, + "loss": 0.0008, + "reward": 3.2143337726593018, + "reward_std": 0.247731015086174, + "rewards/final_reward": 1.1496970084104503, + "rewards/mask_iou_reward": 0.5748485042052252, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2143338322639465, + "rewards/thk_ans_format_reward": 1.0, + "step": 1922, + "think_completion_length": 49.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 3.247892074198988, + "grad_norm": 6.264739124085545, + "kl": 0.55859375, + "learning_rate": 3.51433389544688e-07, + "loss": 0.0006, + "reward": 3.7529088258743286, + "reward_std": 0.008196833077818155, + "rewards/final_reward": 1.7964630349270372, + "rewards/mask_iou_reward": 0.8982315174635186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7529088854789734, + "rewards/thk_ans_format_reward": 1.0, + "step": 1923, + "think_completion_length": 50.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.765625, + "epoch": 3.2495784148397977, + "grad_norm": 31.375759798091025, + "kl": 0.56640625, + "learning_rate": 3.510961214165261e-07, + "loss": 0.0006, + "reward": 3.684548854827881, + "reward_std": 0.10750999674201012, + "rewards/final_reward": 1.7561618903536844, + "rewards/mask_iou_reward": 0.8780809451768422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6845490336418152, + "rewards/thk_ans_format_reward": 1.0, + "step": 1924, + "think_completion_length": 50.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.609375, + "epoch": 3.251264755480607, + "grad_norm": 5.912699538453515, + "kl": 0.5703125, + "learning_rate": 3.5075885328836423e-07, + "loss": 0.0006, + "reward": 3.3729045391082764, + "reward_std": 0.0787664633244276, + "rewards/final_reward": 1.3844858612820419, + "rewards/mask_iou_reward": 0.6922429306410209, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3729044795036316, + "rewards/thk_ans_format_reward": 1.0, + "step": 1925, + "think_completion_length": 48.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.828125, + "epoch": 3.2529510961214165, + "grad_norm": 43.59838934618175, + "kl": 0.59765625, + "learning_rate": 3.504215851602024e-07, + "loss": 0.0006, + "reward": 3.7652982473373413, + "reward_std": 0.025617387611418962, + "rewards/final_reward": 1.874484492995093, + "rewards/mask_iou_reward": 0.9372422464975465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7652981281280518, + "rewards/thk_ans_format_reward": 1.0, + "step": 1926, + "think_completion_length": 51.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.859375, + "epoch": 3.254637436762226, + "grad_norm": 83.00771680277455, + "kl": 0.5419921875, + "learning_rate": 3.5008431703204046e-07, + "loss": 0.0005, + "reward": 3.591688632965088, + "reward_std": 0.11703697592020035, + "rewards/final_reward": 1.6477103820219803, + "rewards/mask_iou_reward": 0.8238551910109901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5916885137557983, + "rewards/thk_ans_format_reward": 1.0, + "step": 1927, + "think_completion_length": 48.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.015625, + "epoch": 3.2563237774030354, + "grad_norm": 24.512476139760526, + "kl": 0.580078125, + "learning_rate": 3.4974704890387855e-07, + "loss": 0.0006, + "reward": 3.7737098932266235, + "reward_std": 0.012913587968796492, + "rewards/final_reward": 1.8550224840824043, + "rewards/mask_iou_reward": 0.9275112420412022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7737098336219788, + "rewards/thk_ans_format_reward": 1.0, + "step": 1928, + "think_completion_length": 42.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.0625, + "epoch": 3.258010118043845, + "grad_norm": 19.987296695281866, + "kl": 0.572265625, + "learning_rate": 3.494097807757167e-07, + "loss": 0.0006, + "reward": 3.2878096103668213, + "reward_std": 0.07040636241436005, + "rewards/final_reward": 1.3527816448964711, + "rewards/mask_iou_reward": 0.6763908224482356, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2878096401691437, + "rewards/thk_ans_format_reward": 1.0, + "step": 1929, + "think_completion_length": 47.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.015625, + "epoch": 3.259696458684654, + "grad_norm": 6.978729980497052, + "kl": 0.4814453125, + "learning_rate": 3.490725126475548e-07, + "loss": 0.0005, + "reward": 3.7521612644195557, + "reward_std": 0.039452452678233385, + "rewards/final_reward": 1.8431230070469085, + "rewards/mask_iou_reward": 0.9215615035234542, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7521612048149109, + "rewards/thk_ans_format_reward": 1.0, + "step": 1930, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.359375, + "epoch": 3.261382799325464, + "grad_norm": 10.264759231390057, + "kl": 0.53515625, + "learning_rate": 3.487352445193929e-07, + "loss": 0.0005, + "reward": 3.3465049266815186, + "reward_std": 0.2511683627963066, + "rewards/final_reward": 1.3293050149147185, + "rewards/mask_iou_reward": 0.6646525074573593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.346504807472229, + "rewards/thk_ans_format_reward": 1.0, + "step": 1931, + "think_completion_length": 54.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.984375, + "epoch": 3.263069139966273, + "grad_norm": 15.61447613519973, + "kl": 0.57421875, + "learning_rate": 3.48397976391231e-07, + "loss": 0.0006, + "reward": 3.618667721748352, + "reward_std": 0.03920717164874077, + "rewards/final_reward": 1.7762699723621134, + "rewards/mask_iou_reward": 0.8881349861810567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6186676621437073, + "rewards/thk_ans_format_reward": 1.0, + "step": 1932, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.25, + "epoch": 3.2647554806070826, + "grad_norm": 12.672323252058314, + "kl": 0.53515625, + "learning_rate": 3.480607082630691e-07, + "loss": 0.0005, + "reward": 3.5231988430023193, + "reward_std": 0.3485229015350342, + "rewards/final_reward": 1.5344188003096824, + "rewards/mask_iou_reward": 0.7672094001548412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5231987833976746, + "rewards/thk_ans_format_reward": 1.0, + "step": 1933, + "think_completion_length": 53.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.5, + "epoch": 3.2664418212478923, + "grad_norm": 6.961605791537598, + "kl": 0.544921875, + "learning_rate": 3.4772344013490723e-07, + "loss": 0.0006, + "reward": 3.493443489074707, + "reward_std": 0.015817434526979923, + "rewards/final_reward": 1.3104796870689048, + "rewards/mask_iou_reward": 0.6552398435344524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4934434294700623, + "rewards/thk_ans_format_reward": 1.0, + "step": 1934, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.546875, + "epoch": 3.2681281618887015, + "grad_norm": 10.4325957223446, + "kl": 0.5546875, + "learning_rate": 3.4738617200674537e-07, + "loss": 0.0006, + "reward": 3.409175992012024, + "reward_std": 0.014063057489693165, + "rewards/final_reward": 1.6345188021142831, + "rewards/mask_iou_reward": 0.8172594010571416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.409175992012024, + "rewards/thk_ans_format_reward": 1.0, + "step": 1935, + "think_completion_length": 47.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.40625, + "epoch": 3.269814502529511, + "grad_norm": 10.917450013229887, + "kl": 0.92578125, + "learning_rate": 3.470489038785834e-07, + "loss": 0.0009, + "reward": 3.5084145069122314, + "reward_std": 0.035716623067855835, + "rewards/final_reward": 1.4282951391233816, + "rewards/mask_iou_reward": 0.7141475695616908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5084145665168762, + "rewards/thk_ans_format_reward": 1.0, + "step": 1936, + "think_completion_length": 45.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.640625, + "epoch": 3.2715008431703203, + "grad_norm": 11.505594870207323, + "kl": 0.431640625, + "learning_rate": 3.4671163575042155e-07, + "loss": 0.0004, + "reward": 3.2266218662261963, + "reward_std": 0.1628934144973755, + "rewards/final_reward": 0.6567636062319168, + "rewards/mask_iou_reward": 0.3283818031159584, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2266216576099396, + "rewards/thk_ans_format_reward": 1.0, + "step": 1937, + "think_completion_length": 49.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.171875, + "epoch": 3.27318718381113, + "grad_norm": 7.500610170611466, + "kl": 0.521484375, + "learning_rate": 3.463743676222597e-07, + "loss": 0.0005, + "reward": 3.5439982414245605, + "reward_std": 0.16045394260436296, + "rewards/final_reward": 1.6367190095262139, + "rewards/mask_iou_reward": 0.8183595047631069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5439982414245605, + "rewards/thk_ans_format_reward": 1.0, + "step": 1938, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 3.274873524451939, + "grad_norm": 12.358596568932127, + "kl": 0.603515625, + "learning_rate": 3.4603709949409783e-07, + "loss": 0.0006, + "reward": 3.4929676055908203, + "reward_std": 0.14762873388826847, + "rewards/final_reward": 1.5403731262583937, + "rewards/mask_iou_reward": 0.7701865631291969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4929676055908203, + "rewards/thk_ans_format_reward": 1.0, + "step": 1939, + "think_completion_length": 51.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.34375, + "epoch": 3.2765598650927488, + "grad_norm": 7.77438392436683, + "kl": 0.5205078125, + "learning_rate": 3.456998313659359e-07, + "loss": 0.0005, + "reward": 3.4649903774261475, + "reward_std": 0.054170895367860794, + "rewards/final_reward": 1.2585329063507007, + "rewards/mask_iou_reward": 0.6292664531753503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4649905562400818, + "rewards/thk_ans_format_reward": 1.0, + "step": 1940, + "think_completion_length": 51.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.65625, + "epoch": 3.2782462057335584, + "grad_norm": 20.018398131695648, + "kl": 0.5546875, + "learning_rate": 3.45362563237774e-07, + "loss": 0.0006, + "reward": 3.057510256767273, + "reward_std": 0.2664487063884735, + "rewards/final_reward": 1.3169868227485813, + "rewards/mask_iou_reward": 0.6584934113742906, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0575102269649506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1941, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.96875, + "epoch": 3.2799325463743676, + "grad_norm": 21.138072216122055, + "kl": 0.556640625, + "learning_rate": 3.4502529510961214e-07, + "loss": 0.0006, + "reward": 3.3030476570129395, + "reward_std": 0.07455268129706383, + "rewards/final_reward": 1.2251721436715706, + "rewards/mask_iou_reward": 0.6125860718357853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303047776222229, + "rewards/thk_ans_format_reward": 1.0, + "step": 1942, + "think_completion_length": 48.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 3.2816188870151772, + "grad_norm": 13.0634941853992, + "kl": 0.603515625, + "learning_rate": 3.4468802698145023e-07, + "loss": 0.0006, + "reward": 3.625705122947693, + "reward_std": 0.19386066659353673, + "rewards/final_reward": 1.756648771490428, + "rewards/mask_iou_reward": 0.878324385745214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6257051825523376, + "rewards/thk_ans_format_reward": 1.0, + "step": 1943, + "think_completion_length": 49.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.46875, + "epoch": 3.2833052276559864, + "grad_norm": 12.687196816233865, + "kl": 0.4619140625, + "learning_rate": 3.4435075885328837e-07, + "loss": 0.0005, + "reward": 3.3297626972198486, + "reward_std": 0.06611794698983431, + "rewards/final_reward": 1.8108261193501756, + "rewards/mask_iou_reward": 0.9054130596750878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3297626972198486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1944, + "think_completion_length": 52.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.625, + "epoch": 3.284991568296796, + "grad_norm": 12.300140965553341, + "kl": 0.5625, + "learning_rate": 3.4401349072512646e-07, + "loss": 0.0006, + "reward": 3.381837844848633, + "reward_std": 0.17056848295032978, + "rewards/final_reward": 1.396779860879466, + "rewards/mask_iou_reward": 0.698389930439733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.381837785243988, + "rewards/thk_ans_format_reward": 1.0, + "step": 1945, + "think_completion_length": 50.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.953125, + "epoch": 3.2866779089376053, + "grad_norm": 21.44964920998146, + "kl": 0.474609375, + "learning_rate": 3.4367622259696455e-07, + "loss": 0.0005, + "reward": 3.4118212461471558, + "reward_std": 0.11445962265133858, + "rewards/final_reward": 1.1539239197745816, + "rewards/mask_iou_reward": 0.5769619598872908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4118210673332214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1946, + "think_completion_length": 42.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 3.288364249578415, + "grad_norm": 37.20110020822072, + "kl": 0.51171875, + "learning_rate": 3.433389544688027e-07, + "loss": 0.0005, + "reward": 3.058582067489624, + "reward_std": 0.17460413463413715, + "rewards/final_reward": 1.126570511549574, + "rewards/mask_iou_reward": 0.563285255774787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0585820376873016, + "rewards/thk_ans_format_reward": 1.0, + "step": 1947, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.53125, + "epoch": 3.2900505902192245, + "grad_norm": 8.782980390987696, + "kl": 0.779296875, + "learning_rate": 3.4300168634064083e-07, + "loss": 0.0008, + "reward": 3.141714930534363, + "reward_std": 0.13721412606537342, + "rewards/final_reward": 1.3629437232870845, + "rewards/mask_iou_reward": 0.6814718616435422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1417149901390076, + "rewards/thk_ans_format_reward": 1.0, + "step": 1948, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.28125, + "epoch": 3.2917369308600337, + "grad_norm": 10.802171398935693, + "kl": 0.5390625, + "learning_rate": 3.4266441821247886e-07, + "loss": 0.0005, + "reward": 2.82895827293396, + "reward_std": 0.19724398013204336, + "rewards/final_reward": 0.314011618343195, + "rewards/mask_iou_reward": 0.1570058091715975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8289581835269928, + "rewards/thk_ans_format_reward": 1.0, + "step": 1949, + "think_completion_length": 48.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.53125, + "epoch": 3.2934232715008434, + "grad_norm": 7.335381485054816, + "kl": 0.64453125, + "learning_rate": 3.42327150084317e-07, + "loss": 0.0006, + "reward": 3.3129823207855225, + "reward_std": 0.05123046040534973, + "rewards/final_reward": 1.3300802900736033, + "rewards/mask_iou_reward": 0.6650401450368016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3129823207855225, + "rewards/thk_ans_format_reward": 1.0, + "step": 1950, + "think_completion_length": 51.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.28125, + "epoch": 3.2951096121416525, + "grad_norm": 9.971884177129356, + "kl": 0.576171875, + "learning_rate": 3.4198988195615514e-07, + "loss": 0.0006, + "reward": 3.6915000677108765, + "reward_std": 0.07537084259092808, + "rewards/final_reward": 1.7635042987473266, + "rewards/mask_iou_reward": 0.8817521493736633, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6915001273155212, + "rewards/thk_ans_format_reward": 1.0, + "step": 1951, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.453125, + "epoch": 3.296795952782462, + "grad_norm": 13.312910016366194, + "kl": 0.548828125, + "learning_rate": 3.416526138279933e-07, + "loss": 0.0006, + "reward": 3.1335134506225586, + "reward_std": 0.1029847264289856, + "rewards/final_reward": 0.9387565138742173, + "rewards/mask_iou_reward": 0.46937825693710866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1335134506225586, + "rewards/thk_ans_format_reward": 1.0, + "step": 1952, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.640625, + "epoch": 3.2984822934232714, + "grad_norm": 5.730294323235976, + "kl": 0.541015625, + "learning_rate": 3.413153456998313e-07, + "loss": 0.0006, + "reward": 3.307882785797119, + "reward_std": 0.08197947776352521, + "rewards/final_reward": 0.9149686594569242, + "rewards/mask_iou_reward": 0.4574843297284621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3078828155994415, + "rewards/thk_ans_format_reward": 1.0, + "step": 1953, + "think_completion_length": 52.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.03125, + "epoch": 3.300168634064081, + "grad_norm": 4.118069432736326, + "kl": 0.52734375, + "learning_rate": 3.4097807757166946e-07, + "loss": 0.0005, + "reward": 3.575540781021118, + "reward_std": 0.09125766530632973, + "rewards/final_reward": 1.7892343078399855, + "rewards/mask_iou_reward": 0.8946171539199927, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5755407810211182, + "rewards/thk_ans_format_reward": 1.0, + "step": 1954, + "think_completion_length": 41.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.921875, + "epoch": 3.30185497470489, + "grad_norm": 5.844672810729986, + "kl": 0.63671875, + "learning_rate": 3.406408094435076e-07, + "loss": 0.0006, + "reward": 3.340492367744446, + "reward_std": 0.09307361952960491, + "rewards/final_reward": 1.3782593471998423, + "rewards/mask_iou_reward": 0.6891296735999212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3404923677444458, + "rewards/thk_ans_format_reward": 1.0, + "step": 1955, + "think_completion_length": 45.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.640625, + "epoch": 3.3035413153457, + "grad_norm": 8.724432617546066, + "kl": 0.65234375, + "learning_rate": 3.403035413153457e-07, + "loss": 0.0006, + "reward": 3.5463435649871826, + "reward_std": 0.022105058655142784, + "rewards/final_reward": 1.7385080818964855, + "rewards/mask_iou_reward": 0.8692540409482428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5463436841964722, + "rewards/thk_ans_format_reward": 1.0, + "step": 1956, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.671875, + "epoch": 3.305227655986509, + "grad_norm": 5.245602464263963, + "kl": 0.4326171875, + "learning_rate": 3.3996627318718377e-07, + "loss": 0.0004, + "reward": 3.68087375164032, + "reward_std": 0.20528633147478104, + "rewards/final_reward": 1.6173913805101567, + "rewards/mask_iou_reward": 0.8086956902550784, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6808737516403198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1957, + "think_completion_length": 42.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.15625, + "epoch": 3.3069139966273187, + "grad_norm": 7.913915419345036, + "kl": 0.509765625, + "learning_rate": 3.396290050590219e-07, + "loss": 0.0005, + "reward": 3.4271721839904785, + "reward_std": 0.15432360395789146, + "rewards/final_reward": 1.7015906586002267, + "rewards/mask_iou_reward": 0.8507953293001134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4271721243858337, + "rewards/thk_ans_format_reward": 1.0, + "step": 1958, + "think_completion_length": 47.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.09375, + "epoch": 3.3086003372681283, + "grad_norm": 16.336278572012315, + "kl": 0.58984375, + "learning_rate": 3.3929173693086e-07, + "loss": 0.0006, + "reward": 3.6032882928848267, + "reward_std": 0.10025950521230698, + "rewards/final_reward": 1.5689837273879075, + "rewards/mask_iou_reward": 0.7844918636939537, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6032883524894714, + "rewards/thk_ans_format_reward": 1.0, + "step": 1959, + "think_completion_length": 44.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.25, + "epoch": 3.3102866779089375, + "grad_norm": 11.476881090573237, + "kl": 0.52734375, + "learning_rate": 3.3895446880269814e-07, + "loss": 0.0005, + "reward": 3.1221102476119995, + "reward_std": 0.5458821058273315, + "rewards/final_reward": 1.2438918568758344, + "rewards/mask_iou_reward": 0.6219459284379172, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1221102476119995, + "rewards/thk_ans_format_reward": 1.0, + "step": 1960, + "think_completion_length": 49.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.078125, + "epoch": 3.311973018549747, + "grad_norm": 7.1417455994906796, + "kl": 0.53515625, + "learning_rate": 3.3861720067453623e-07, + "loss": 0.0005, + "reward": 3.7143653631210327, + "reward_std": 0.1868691765703261, + "rewards/final_reward": 1.569197623422851, + "rewards/mask_iou_reward": 0.7845988117114255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7143654227256775, + "rewards/thk_ans_format_reward": 1.0, + "step": 1961, + "think_completion_length": 44.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.625, + "epoch": 3.3136593591905563, + "grad_norm": 11.18135115326309, + "kl": 0.556640625, + "learning_rate": 3.382799325463743e-07, + "loss": 0.0006, + "reward": 3.6555434465408325, + "reward_std": 0.026283076032996178, + "rewards/final_reward": 1.713166532577928, + "rewards/mask_iou_reward": 0.856583266288964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.655543327331543, + "rewards/thk_ans_format_reward": 1.0, + "step": 1962, + "think_completion_length": 47.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.15625, + "epoch": 3.315345699831366, + "grad_norm": 5.682834234008431, + "kl": 0.59765625, + "learning_rate": 3.3794266441821246e-07, + "loss": 0.0005, + "reward": 3.7232742309570312, + "reward_std": 0.004197546397335827, + "rewards/final_reward": 1.612368796690126, + "rewards/mask_iou_reward": 0.806184398345063, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7232744693756104, + "rewards/thk_ans_format_reward": 1.0, + "step": 1963, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.703125, + "epoch": 3.317032040472175, + "grad_norm": 17.08310617062568, + "kl": 0.5625, + "learning_rate": 3.376053962900506e-07, + "loss": 0.0007, + "reward": 3.6794604063034058, + "reward_std": 0.12550297752022743, + "rewards/final_reward": 1.52696537807861, + "rewards/mask_iou_reward": 0.763482689039305, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6794604063034058, + "rewards/thk_ans_format_reward": 1.0, + "step": 1964, + "think_completion_length": 46.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.078125, + "epoch": 3.318718381112985, + "grad_norm": 6.031217553059817, + "kl": 0.5703125, + "learning_rate": 3.3726812816188874e-07, + "loss": 0.0006, + "reward": 3.697144627571106, + "reward_std": 0.05235449317842722, + "rewards/final_reward": 1.5463581091045677, + "rewards/mask_iou_reward": 0.7731790545522839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.697144627571106, + "rewards/thk_ans_format_reward": 1.0, + "step": 1965, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.8125, + "epoch": 3.3204047217537944, + "grad_norm": 9.929998031126646, + "kl": 0.580078125, + "learning_rate": 3.3693086003372677e-07, + "loss": 0.0006, + "reward": 2.651795506477356, + "reward_std": 0.13870839029550552, + "rewards/final_reward": 0.679500139659625, + "rewards/mask_iou_reward": 0.3397500698298125, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6517956554889679, + "rewards/thk_ans_format_reward": 1.0, + "step": 1966, + "think_completion_length": 41.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.59375, + "epoch": 3.3220910623946036, + "grad_norm": 5.908865902855194, + "kl": 0.51171875, + "learning_rate": 3.365935919055649e-07, + "loss": 0.0005, + "reward": 3.6012171506881714, + "reward_std": 0.03993457509204745, + "rewards/final_reward": 1.517730822907334, + "rewards/mask_iou_reward": 0.758865411453667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6012172102928162, + "rewards/thk_ans_format_reward": 1.0, + "step": 1967, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.09375, + "epoch": 3.3237774030354132, + "grad_norm": 6.590923867848572, + "kl": 0.576171875, + "learning_rate": 3.3625632377740305e-07, + "loss": 0.0006, + "reward": 3.4035218954086304, + "reward_std": 0.15390025824308395, + "rewards/final_reward": 1.7916255876053522, + "rewards/mask_iou_reward": 0.8958127938026761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4035218954086304, + "rewards/thk_ans_format_reward": 1.0, + "step": 1968, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.53125, + "epoch": 3.3254637436762224, + "grad_norm": 13.828438711649138, + "kl": 0.478515625, + "learning_rate": 3.3591905564924114e-07, + "loss": 0.0005, + "reward": 3.4149030447006226, + "reward_std": 0.08157273754477501, + "rewards/final_reward": 1.1635934078063273, + "rewards/mask_iou_reward": 0.5817967039031636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4149029850959778, + "rewards/thk_ans_format_reward": 1.0, + "step": 1969, + "think_completion_length": 47.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.84375, + "epoch": 3.327150084317032, + "grad_norm": 10.066970850691444, + "kl": 0.595703125, + "learning_rate": 3.355817875210792e-07, + "loss": 0.0008, + "reward": 3.6865395307540894, + "reward_std": 0.07062768749892712, + "rewards/final_reward": 1.8212618523933068, + "rewards/mask_iou_reward": 0.9106309261966534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6865394711494446, + "rewards/thk_ans_format_reward": 1.0, + "step": 1970, + "think_completion_length": 47.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.59375, + "epoch": 3.3288364249578413, + "grad_norm": 8.74797620219619, + "kl": 0.49609375, + "learning_rate": 3.3524451939291737e-07, + "loss": 0.0005, + "reward": 3.249285936355591, + "reward_std": 0.08715942595154047, + "rewards/final_reward": 1.2155885135791953, + "rewards/mask_iou_reward": 0.6077942567895976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2492859959602356, + "rewards/thk_ans_format_reward": 1.0, + "step": 1971, + "think_completion_length": 42.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.671875, + "epoch": 3.330522765598651, + "grad_norm": 20.092660644475593, + "kl": 0.609375, + "learning_rate": 3.3490725126475545e-07, + "loss": 0.0006, + "reward": 2.717191219329834, + "reward_std": 0.2924363315105438, + "rewards/final_reward": 0.9926286151476688, + "rewards/mask_iou_reward": 0.4963143075738344, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.7796911597251892, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1972, + "think_completion_length": 48.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.03125, + "epoch": 3.3322091062394605, + "grad_norm": 18.654057852768332, + "kl": 0.576171875, + "learning_rate": 3.345699831365936e-07, + "loss": 0.0006, + "reward": 3.6741316318511963, + "reward_std": 0.0648178979754448, + "rewards/final_reward": 1.4977320756789554, + "rewards/mask_iou_reward": 0.7488660378394777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.674131691455841, + "rewards/thk_ans_format_reward": 1.0, + "step": 1973, + "think_completion_length": 47.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 3.3338954468802697, + "grad_norm": 6.74948481793854, + "kl": 0.55859375, + "learning_rate": 3.342327150084317e-07, + "loss": 0.0006, + "reward": 3.2847015857696533, + "reward_std": 0.057460593059659004, + "rewards/final_reward": 0.9617596873774227, + "rewards/mask_iou_reward": 0.48087984368871134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.284701406955719, + "rewards/thk_ans_format_reward": 1.0, + "step": 1974, + "think_completion_length": 51.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.78125, + "epoch": 3.3355817875210794, + "grad_norm": 25.024892296128783, + "kl": 0.478515625, + "learning_rate": 3.338954468802698e-07, + "loss": 0.0005, + "reward": 3.3408730030059814, + "reward_std": 0.061326127499341965, + "rewards/final_reward": 1.5239473580254677, + "rewards/mask_iou_reward": 0.7619736790127338, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.340872883796692, + "rewards/thk_ans_format_reward": 1.0, + "step": 1975, + "think_completion_length": 48.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.109375, + "epoch": 3.3372681281618886, + "grad_norm": 12.554408530560169, + "kl": 0.5546875, + "learning_rate": 3.335581787521079e-07, + "loss": 0.0005, + "reward": 2.9226410388946533, + "reward_std": 0.2110334150493145, + "rewards/final_reward": 0.45685307238001177, + "rewards/mask_iou_reward": 0.22842653619000589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9226409792900085, + "rewards/thk_ans_format_reward": 1.0, + "step": 1976, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.71875, + "epoch": 3.338954468802698, + "grad_norm": 9.750316649812305, + "kl": 0.4951171875, + "learning_rate": 3.3322091062394605e-07, + "loss": 0.0005, + "reward": 3.3409875631332397, + "reward_std": 0.10255017504096031, + "rewards/final_reward": 0.9411056049973254, + "rewards/mask_iou_reward": 0.4705528024986627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3409876227378845, + "rewards/thk_ans_format_reward": 1.0, + "step": 1977, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.1875, + "epoch": 3.3406408094435074, + "grad_norm": 6.12924664923464, + "kl": 0.599609375, + "learning_rate": 3.3288364249578414e-07, + "loss": 0.0006, + "reward": 3.2812459468841553, + "reward_std": 0.033132096752524376, + "rewards/final_reward": 0.9117269831595742, + "rewards/mask_iou_reward": 0.4558634915797871, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2812458276748657, + "rewards/thk_ans_format_reward": 1.0, + "step": 1978, + "think_completion_length": 49.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.40625, + "epoch": 3.342327150084317, + "grad_norm": 12.976404555979629, + "kl": 0.5234375, + "learning_rate": 3.325463743676222e-07, + "loss": 0.0005, + "reward": 3.1836129426956177, + "reward_std": 0.0405933503061533, + "rewards/final_reward": 0.7397104617970125, + "rewards/mask_iou_reward": 0.36985523089850625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.183612883090973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1979, + "think_completion_length": 49.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.53125, + "epoch": 3.3440134907251267, + "grad_norm": 6.388749541296147, + "kl": 0.578125, + "learning_rate": 3.3220910623946036e-07, + "loss": 0.0006, + "reward": 3.7560627460479736, + "reward_std": 0.01202178793027997, + "rewards/final_reward": 1.8708323976728498, + "rewards/mask_iou_reward": 0.9354161988364249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.756062626838684, + "rewards/thk_ans_format_reward": 1.0, + "step": 1980, + "think_completion_length": 48.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.59375, + "epoch": 3.345699831365936, + "grad_norm": 45.06176010876029, + "kl": 0.6015625, + "learning_rate": 3.318718381112985e-07, + "loss": 0.0006, + "reward": 3.0571417808532715, + "reward_std": 0.10221374221146107, + "rewards/final_reward": 1.0803357033982843, + "rewards/mask_iou_reward": 0.5401678516991422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0571418702602386, + "rewards/thk_ans_format_reward": 1.0, + "step": 1981, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.453125, + "epoch": 3.3473861720067455, + "grad_norm": 7.820569038981191, + "kl": 0.591796875, + "learning_rate": 3.3153456998313654e-07, + "loss": 0.0006, + "reward": 3.6929433345794678, + "reward_std": 0.1502692373469472, + "rewards/final_reward": 1.646944216830329, + "rewards/mask_iou_reward": 0.8234721084151645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6929433345794678, + "rewards/thk_ans_format_reward": 1.0, + "step": 1982, + "think_completion_length": 52.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 3.3490725126475547, + "grad_norm": 7.5418838910115396, + "kl": 0.61328125, + "learning_rate": 3.311973018549747e-07, + "loss": 0.0006, + "reward": 3.5247732400894165, + "reward_std": 0.04335535317659378, + "rewards/final_reward": 1.846402055210505, + "rewards/mask_iou_reward": 0.9232010276052525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5247732400894165, + "rewards/thk_ans_format_reward": 1.0, + "step": 1983, + "think_completion_length": 45.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.46875, + "epoch": 3.3507588532883643, + "grad_norm": 6.148037786110945, + "kl": 0.443359375, + "learning_rate": 3.308600337268128e-07, + "loss": 0.0004, + "reward": 3.2120453119277954, + "reward_std": 0.1635238453745842, + "rewards/final_reward": 1.1920128923702205, + "rewards/mask_iou_reward": 0.5960064461851102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2120453119277954, + "rewards/thk_ans_format_reward": 1.0, + "step": 1984, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.171875, + "epoch": 3.3524451939291735, + "grad_norm": 6.184763419654266, + "kl": 0.568359375, + "learning_rate": 3.305227655986509e-07, + "loss": 0.0006, + "reward": 3.331853151321411, + "reward_std": 0.043349689804017544, + "rewards/final_reward": 1.3908189338518653, + "rewards/mask_iou_reward": 0.6954094669259326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3318531513214111, + "rewards/thk_ans_format_reward": 1.0, + "step": 1985, + "think_completion_length": 45.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.734375, + "epoch": 3.354131534569983, + "grad_norm": 8.375346934472132, + "kl": 0.455078125, + "learning_rate": 3.30185497470489e-07, + "loss": 0.0005, + "reward": 3.3176496028900146, + "reward_std": 0.08942844346165657, + "rewards/final_reward": 1.2392748999825676, + "rewards/mask_iou_reward": 0.6196374499912838, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.317649632692337, + "rewards/thk_ans_format_reward": 1.0, + "step": 1986, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.984375, + "epoch": 3.3558178752107928, + "grad_norm": 18.287534548610328, + "kl": 0.578125, + "learning_rate": 3.2984822934232713e-07, + "loss": 0.0005, + "reward": 3.329722285270691, + "reward_std": 0.27538200467824936, + "rewards/final_reward": 0.9544553569256847, + "rewards/mask_iou_reward": 0.47722767846284236, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3609723448753357, + "rewards/thk_ans_format_reward": 0.984375, + "step": 1987, + "think_completion_length": 48.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.875, + "epoch": 3.357504215851602, + "grad_norm": 7.456631450698301, + "kl": 0.560546875, + "learning_rate": 3.295109612141653e-07, + "loss": 0.0006, + "reward": 3.1098419427871704, + "reward_std": 0.17263797670602798, + "rewards/final_reward": 1.04505706196219, + "rewards/mask_iou_reward": 0.522528530981095, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1098419725894928, + "rewards/thk_ans_format_reward": 1.0, + "step": 1988, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.546875, + "epoch": 3.3591905564924116, + "grad_norm": 5.7076069066498025, + "kl": 0.5390625, + "learning_rate": 3.2917369308600336e-07, + "loss": 0.0005, + "reward": 3.655590057373047, + "reward_std": 0.2154662348330021, + "rewards/final_reward": 1.5834517482320138, + "rewards/mask_iou_reward": 0.7917258741160069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6555900573730469, + "rewards/thk_ans_format_reward": 1.0, + "step": 1989, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.515625, + "epoch": 3.360876897133221, + "grad_norm": 7.505812510113379, + "kl": 0.548828125, + "learning_rate": 3.2883642495784145e-07, + "loss": 0.0005, + "reward": 3.279410719871521, + "reward_std": 0.1372041329741478, + "rewards/final_reward": 1.574243533042608, + "rewards/mask_iou_reward": 0.787121766521304, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2794106602668762, + "rewards/thk_ans_format_reward": 1.0, + "step": 1990, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.765625, + "epoch": 3.3625632377740304, + "grad_norm": 9.130921737801906, + "kl": 0.5546875, + "learning_rate": 3.284991568296796e-07, + "loss": 0.0006, + "reward": 3.6248198747634888, + "reward_std": 0.29395322501659393, + "rewards/final_reward": 1.6349420585259278, + "rewards/mask_iou_reward": 0.8174710292629639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6248198747634888, + "rewards/thk_ans_format_reward": 1.0, + "step": 1991, + "think_completion_length": 52.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.640625, + "epoch": 3.3642495784148396, + "grad_norm": 19.766241043754633, + "kl": 0.484375, + "learning_rate": 3.281618887015177e-07, + "loss": 0.0005, + "reward": 3.090684175491333, + "reward_std": 0.11629136651754379, + "rewards/final_reward": 0.7971860584649041, + "rewards/mask_iou_reward": 0.39859302923245205, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0906842648983002, + "rewards/thk_ans_format_reward": 1.0, + "step": 1992, + "think_completion_length": 44.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.390625, + "epoch": 3.3659359190556493, + "grad_norm": 84.45935300190781, + "kl": 0.5625, + "learning_rate": 3.278246205733558e-07, + "loss": 0.0006, + "reward": 3.4176278114318848, + "reward_std": 0.12633688002824783, + "rewards/final_reward": 1.6584649862155265, + "rewards/mask_iou_reward": 0.8292324931077633, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.41762775182724, + "rewards/thk_ans_format_reward": 1.0, + "step": 1993, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.90625, + "epoch": 3.367622259696459, + "grad_norm": 8.5894189382737, + "kl": 0.568359375, + "learning_rate": 3.274873524451939e-07, + "loss": 0.0006, + "reward": 3.543907880783081, + "reward_std": 0.0963448672555387, + "rewards/final_reward": 1.799001571553526, + "rewards/mask_iou_reward": 0.899500785776763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5439078211784363, + "rewards/thk_ans_format_reward": 1.0, + "step": 1994, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 3.369308600337268, + "grad_norm": 5.872835614458384, + "kl": 0.62890625, + "learning_rate": 3.27150084317032e-07, + "loss": 0.0007, + "reward": 3.2476214170455933, + "reward_std": 0.09465612005442381, + "rewards/final_reward": 1.18733627472457, + "rewards/mask_iou_reward": 0.593668137362285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2476215362548828, + "rewards/thk_ans_format_reward": 1.0, + "step": 1995, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.21875, + "epoch": 3.3709949409780777, + "grad_norm": 12.968097799327696, + "kl": 0.685546875, + "learning_rate": 3.2681281618887013e-07, + "loss": 0.0007, + "reward": 3.2468059062957764, + "reward_std": 0.14746061153709888, + "rewards/final_reward": 1.023684623301193, + "rewards/mask_iou_reward": 0.5118423116505965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2468059062957764, + "rewards/thk_ans_format_reward": 1.0, + "step": 1996, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.71875, + "epoch": 3.372681281618887, + "grad_norm": 14.993651185819589, + "kl": 0.615234375, + "learning_rate": 3.264755480607083e-07, + "loss": 0.0006, + "reward": 2.8943779468536377, + "reward_std": 0.1070544458925724, + "rewards/final_reward": 1.6384638413116381, + "rewards/mask_iou_reward": 0.8192319206558191, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8943780958652496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1997, + "think_completion_length": 47.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.09375, + "epoch": 3.3743676222596966, + "grad_norm": 4.5367073345316555, + "kl": 0.5166015625, + "learning_rate": 3.261382799325463e-07, + "loss": 0.0005, + "reward": 3.6099237203598022, + "reward_std": 0.07347086956724524, + "rewards/final_reward": 1.8372419839056675, + "rewards/mask_iou_reward": 0.9186209919528338, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6099236011505127, + "rewards/thk_ans_format_reward": 1.0, + "step": 1998, + "think_completion_length": 35.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.015625, + "epoch": 3.3760539629005057, + "grad_norm": 7.102930347949191, + "kl": 0.615234375, + "learning_rate": 3.2580101180438445e-07, + "loss": 0.0006, + "reward": 3.5327283143997192, + "reward_std": 0.14749955013394356, + "rewards/final_reward": 1.8147781307339632, + "rewards/mask_iou_reward": 0.9073890653669816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5327282547950745, + "rewards/thk_ans_format_reward": 1.0, + "step": 1999, + "think_completion_length": 39.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.171875, + "epoch": 3.3777403035413154, + "grad_norm": 9.942553625687975, + "kl": 0.537109375, + "learning_rate": 3.254637436762226e-07, + "loss": 0.0005, + "reward": 3.547389268875122, + "reward_std": 0.1251727119088173, + "rewards/final_reward": 1.511609798042504, + "rewards/mask_iou_reward": 0.755804899021252, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5473893284797668, + "rewards/thk_ans_format_reward": 1.0, + "step": 2000, + "think_completion_length": 42.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.984375, + "epoch": 3.379426644182125, + "grad_norm": 24.911534805683672, + "kl": 1.146484375, + "learning_rate": 3.2512647554806073e-07, + "loss": 0.0011, + "reward": 3.4626386165618896, + "reward_std": 0.06342816725373268, + "rewards/final_reward": 1.3687977350358613, + "rewards/mask_iou_reward": 0.6843988675179307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4626386761665344, + "rewards/thk_ans_format_reward": 1.0, + "step": 2001, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.46875, + "epoch": 3.381112984822934, + "grad_norm": 63.71804719258427, + "kl": 0.5546875, + "learning_rate": 3.247892074198988e-07, + "loss": 0.0006, + "reward": 3.0394468307495117, + "reward_std": 0.27142253518104553, + "rewards/final_reward": 1.1951839934034023, + "rewards/mask_iou_reward": 0.5975919967017012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0394466519355774, + "rewards/thk_ans_format_reward": 1.0, + "step": 2002, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.609375, + "epoch": 3.382799325463744, + "grad_norm": 10.90467501879512, + "kl": 0.57421875, + "learning_rate": 3.244519392917369e-07, + "loss": 0.0006, + "reward": 3.1326643228530884, + "reward_std": 0.041215680539608, + "rewards/final_reward": 0.9408250897246508, + "rewards/mask_iou_reward": 0.4704125448623254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1326642036437988, + "rewards/thk_ans_format_reward": 1.0, + "step": 2003, + "think_completion_length": 44.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.6875, + "epoch": 3.384485666104553, + "grad_norm": 10.49759767307761, + "kl": 0.568359375, + "learning_rate": 3.2411467116357504e-07, + "loss": 0.0006, + "reward": 3.1833359003067017, + "reward_std": 0.08077399618923664, + "rewards/final_reward": 1.5277296883693114, + "rewards/mask_iou_reward": 0.7638648441846557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1833359003067017, + "rewards/thk_ans_format_reward": 1.0, + "step": 2004, + "think_completion_length": 46.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 3.3861720067453627, + "grad_norm": 69.54637223102522, + "kl": 0.6015625, + "learning_rate": 3.2377740303541313e-07, + "loss": 0.0006, + "reward": 3.186821937561035, + "reward_std": 0.07137523218989372, + "rewards/final_reward": 1.4912311188442389, + "rewards/mask_iou_reward": 0.7456155594221194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.186821848154068, + "rewards/thk_ans_format_reward": 1.0, + "step": 2005, + "think_completion_length": 49.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 3.387858347386172, + "grad_norm": 9.035577647308175, + "kl": 0.80078125, + "learning_rate": 3.2344013490725127e-07, + "loss": 0.0008, + "reward": 3.420803427696228, + "reward_std": 0.18055840581655502, + "rewards/final_reward": 1.6420055230666772, + "rewards/mask_iou_reward": 0.8210027615333386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4208033084869385, + "rewards/thk_ans_format_reward": 1.0, + "step": 2006, + "think_completion_length": 51.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5, + "epoch": 3.3895446880269815, + "grad_norm": 8.342453926202667, + "kl": 0.736328125, + "learning_rate": 3.2310286677908936e-07, + "loss": 0.0007, + "reward": 3.167693018913269, + "reward_std": 0.07003648579120636, + "rewards/final_reward": 1.6847967851713777, + "rewards/mask_iou_reward": 0.8423983925856888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1676931977272034, + "rewards/thk_ans_format_reward": 1.0, + "step": 2007, + "think_completion_length": 41.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 3.391231028667791, + "grad_norm": 7.629456776734595, + "kl": 0.6328125, + "learning_rate": 3.2276559865092745e-07, + "loss": 0.0006, + "reward": 3.3508166074752808, + "reward_std": 0.12238148972392082, + "rewards/final_reward": 1.0337033341593962, + "rewards/mask_iou_reward": 0.5168516670796981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.350816547870636, + "rewards/thk_ans_format_reward": 1.0, + "step": 2008, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.34375, + "epoch": 3.3929173693086003, + "grad_norm": 7.234444379509983, + "kl": 0.66796875, + "learning_rate": 3.224283305227656e-07, + "loss": 0.0007, + "reward": 3.4166321754455566, + "reward_std": 0.23674443364143372, + "rewards/final_reward": 1.1408130644550725, + "rewards/mask_iou_reward": 0.5704065322275362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4166321754455566, + "rewards/thk_ans_format_reward": 1.0, + "step": 2009, + "think_completion_length": 44.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.984375, + "epoch": 3.39460370994941, + "grad_norm": 7.506944229344649, + "kl": 0.58203125, + "learning_rate": 3.2209106239460373e-07, + "loss": 0.0006, + "reward": 3.1270315647125244, + "reward_std": 0.11961232125759125, + "rewards/final_reward": 0.9250940028783735, + "rewards/mask_iou_reward": 0.46254700143918676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1270316243171692, + "rewards/thk_ans_format_reward": 1.0, + "step": 2010, + "think_completion_length": 50.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.09375, + "epoch": 3.396290050590219, + "grad_norm": 5.081626548226494, + "kl": 0.5185546875, + "learning_rate": 3.2175379426644176e-07, + "loss": 0.0005, + "reward": 3.4909207820892334, + "reward_std": 0.12029211595654488, + "rewards/final_reward": 1.5802509611939328, + "rewards/mask_iou_reward": 0.7901254805969664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4909207820892334, + "rewards/thk_ans_format_reward": 1.0, + "step": 2011, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.03125, + "epoch": 3.397976391231029, + "grad_norm": 7.583470125843207, + "kl": 0.5419921875, + "learning_rate": 3.214165261382799e-07, + "loss": 0.0005, + "reward": 3.3409262895584106, + "reward_std": 0.14934771042317152, + "rewards/final_reward": 1.3885411302330193, + "rewards/mask_iou_reward": 0.6942705651165096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3409262895584106, + "rewards/thk_ans_format_reward": 1.0, + "step": 2012, + "think_completion_length": 42.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9375, + "epoch": 3.399662731871838, + "grad_norm": 8.621093358210288, + "kl": 0.537109375, + "learning_rate": 3.2107925801011804e-07, + "loss": 0.0005, + "reward": 3.067779302597046, + "reward_std": 0.27544330805540085, + "rewards/final_reward": 0.8974693611197712, + "rewards/mask_iou_reward": 0.4487346805598856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0677792429924011, + "rewards/thk_ans_format_reward": 1.0, + "step": 2013, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9375, + "epoch": 3.4013490725126476, + "grad_norm": 10.964660941089583, + "kl": 0.6328125, + "learning_rate": 3.207419898819562e-07, + "loss": 0.0006, + "reward": 3.358281373977661, + "reward_std": 0.06170746497809887, + "rewards/final_reward": 0.8549180381163555, + "rewards/mask_iou_reward": 0.42745901905817774, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3582815527915955, + "rewards/thk_ans_format_reward": 1.0, + "step": 2014, + "think_completion_length": 41.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.8125, + "epoch": 3.403035413153457, + "grad_norm": 6.593952928176873, + "kl": 0.5068359375, + "learning_rate": 3.204047217537942e-07, + "loss": 0.0005, + "reward": 3.2059460878372192, + "reward_std": 0.1293669156730175, + "rewards/final_reward": 1.1842376119410463, + "rewards/mask_iou_reward": 0.5921188059705231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2059460878372192, + "rewards/thk_ans_format_reward": 1.0, + "step": 2015, + "think_completion_length": 44.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.859375, + "epoch": 3.4047217537942664, + "grad_norm": 16.12581705453544, + "kl": 0.60546875, + "learning_rate": 3.2006745362563236e-07, + "loss": 0.0006, + "reward": 3.2048619985580444, + "reward_std": 0.17501818388700485, + "rewards/final_reward": 1.3144665586082775, + "rewards/mask_iou_reward": 0.6572332793041388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2048619985580444, + "rewards/thk_ans_format_reward": 1.0, + "step": 2016, + "think_completion_length": 41.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.265625, + "epoch": 3.4064080944350756, + "grad_norm": 5.37399183205758, + "kl": 0.552734375, + "learning_rate": 3.197301854974705e-07, + "loss": 0.0006, + "reward": 3.5387697219848633, + "reward_std": 0.011349121574312449, + "rewards/final_reward": 1.845533327452562, + "rewards/mask_iou_reward": 0.922766663726281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.538769781589508, + "rewards/thk_ans_format_reward": 1.0, + "step": 2017, + "think_completion_length": 43.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.296875, + "epoch": 3.4080944350758853, + "grad_norm": 25.972066541966154, + "kl": 0.4873046875, + "learning_rate": 3.193929173693086e-07, + "loss": 0.0005, + "reward": 3.243216633796692, + "reward_std": 0.43901199474930763, + "rewards/final_reward": 1.404129992277193, + "rewards/mask_iou_reward": 0.7020649961385965, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.305716633796692, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2018, + "think_completion_length": 39.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.609375, + "epoch": 3.409780775716695, + "grad_norm": 12.503372878276222, + "kl": 0.546875, + "learning_rate": 3.1905564924114667e-07, + "loss": 0.0005, + "reward": 3.233471155166626, + "reward_std": 0.021571812219917774, + "rewards/final_reward": 0.977786239537827, + "rewards/mask_iou_reward": 0.4888931197689135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.233471155166626, + "rewards/thk_ans_format_reward": 1.0, + "step": 2019, + "think_completion_length": 49.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.125, + "epoch": 3.411467116357504, + "grad_norm": 9.279611653873177, + "kl": 0.552734375, + "learning_rate": 3.187183811129848e-07, + "loss": 0.0006, + "reward": 3.442598581314087, + "reward_std": 0.22078751027584076, + "rewards/final_reward": 1.3976067600815076, + "rewards/mask_iou_reward": 0.6988033800407538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4425984621047974, + "rewards/thk_ans_format_reward": 1.0, + "step": 2020, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.765625, + "epoch": 3.4131534569983137, + "grad_norm": 5.152016285443864, + "kl": 0.4921875, + "learning_rate": 3.183811129848229e-07, + "loss": 0.0005, + "reward": 3.3022444248199463, + "reward_std": 0.07596256211400032, + "rewards/final_reward": 1.1442824262621978, + "rewards/mask_iou_reward": 0.5721412131310989, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3022443056106567, + "rewards/thk_ans_format_reward": 1.0, + "step": 2021, + "think_completion_length": 45.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.1875, + "epoch": 3.414839797639123, + "grad_norm": 71.59325436651602, + "kl": 0.591796875, + "learning_rate": 3.1804384485666104e-07, + "loss": 0.0006, + "reward": 2.967165946960449, + "reward_std": 0.06337304785847664, + "rewards/final_reward": 1.3569264816968474, + "rewards/mask_iou_reward": 0.6784632408484237, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9671659171581268, + "rewards/thk_ans_format_reward": 1.0, + "step": 2022, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.203125, + "epoch": 3.4165261382799326, + "grad_norm": 5.560328479002505, + "kl": 0.619140625, + "learning_rate": 3.1770657672849913e-07, + "loss": 0.0006, + "reward": 2.8386834859848022, + "reward_std": 0.013690002728253603, + "rewards/final_reward": 0.0, + "rewards/mask_iou_reward": 0.0, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.838683545589447, + "rewards/thk_ans_format_reward": 1.0, + "step": 2023, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.109375, + "epoch": 3.4182124789207418, + "grad_norm": 10.56040792391872, + "kl": 0.6484375, + "learning_rate": 3.173693086003372e-07, + "loss": 0.0006, + "reward": 2.722140312194824, + "reward_std": 0.06009296700358391, + "rewards/final_reward": 1.0703529767006847, + "rewards/mask_iou_reward": 0.5351764883503424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7221404016017914, + "rewards/thk_ans_format_reward": 1.0, + "step": 2024, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.03125, + "epoch": 3.4198988195615514, + "grad_norm": 19.433490902830165, + "kl": 0.8984375, + "learning_rate": 3.1703204047217536e-07, + "loss": 0.0009, + "reward": 3.1550711393356323, + "reward_std": 0.5463046324439347, + "rewards/final_reward": 1.3991503312967488, + "rewards/mask_iou_reward": 0.6995751656483744, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.280071198940277, + "rewards/thk_ans_format_reward": 0.9375, + "step": 2025, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.765625, + "epoch": 3.421585160202361, + "grad_norm": 17.87867964050926, + "kl": 0.6328125, + "learning_rate": 3.166947723440135e-07, + "loss": 0.0006, + "reward": 3.1133947372436523, + "reward_std": 0.109176866710186, + "rewards/final_reward": 1.1625815169505365, + "rewards/mask_iou_reward": 0.5812907584752682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1133947968482971, + "rewards/thk_ans_format_reward": 1.0, + "step": 2026, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.0, + "epoch": 3.4232715008431702, + "grad_norm": 10.094195931918332, + "kl": 0.55859375, + "learning_rate": 3.1635750421585164e-07, + "loss": 0.0006, + "reward": 3.1443214416503906, + "reward_std": 0.06616199389100075, + "rewards/final_reward": 0.677497794267151, + "rewards/mask_iou_reward": 0.3387488971335755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1443215012550354, + "rewards/thk_ans_format_reward": 1.0, + "step": 2027, + "think_completion_length": 42.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.171875, + "epoch": 3.42495784148398, + "grad_norm": 5.596862017138307, + "kl": 0.552734375, + "learning_rate": 3.1602023608768967e-07, + "loss": 0.0006, + "reward": 2.66789174079895, + "reward_std": 0.3597968891263008, + "rewards/final_reward": 0.12884763875733346, + "rewards/mask_iou_reward": 0.06442381937866673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.667891800403595, + "rewards/thk_ans_format_reward": 1.0, + "step": 2028, + "think_completion_length": 42.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.140625, + "epoch": 3.426644182124789, + "grad_norm": 5.723799357111686, + "kl": 0.568359375, + "learning_rate": 3.156829679595278e-07, + "loss": 0.0006, + "reward": 3.7234787940979004, + "reward_std": 0.034357505617663264, + "rewards/final_reward": 1.5354280173268653, + "rewards/mask_iou_reward": 0.7677140086634326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7234787940979004, + "rewards/thk_ans_format_reward": 1.0, + "step": 2029, + "think_completion_length": 43.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.84375, + "epoch": 3.4283305227655987, + "grad_norm": 25.451963993287492, + "kl": 0.59375, + "learning_rate": 3.1534569983136595e-07, + "loss": 0.0006, + "reward": 3.073733687400818, + "reward_std": 0.2858571792021394, + "rewards/final_reward": 0.6562628353296543, + "rewards/mask_iou_reward": 0.3281314176648272, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.073733627796173, + "rewards/thk_ans_format_reward": 1.0, + "step": 2030, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.359375, + "epoch": 3.430016863406408, + "grad_norm": 8.554774241678064, + "kl": 0.552734375, + "learning_rate": 3.1500843170320404e-07, + "loss": 0.0005, + "reward": 3.527369976043701, + "reward_std": 0.20771950855851173, + "rewards/final_reward": 1.6726139266029252, + "rewards/mask_iou_reward": 0.8363069633014626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5273699164390564, + "rewards/thk_ans_format_reward": 1.0, + "step": 2031, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.1875, + "epoch": 3.4317032040472175, + "grad_norm": 12.03124597768691, + "kl": 0.634765625, + "learning_rate": 3.146711635750421e-07, + "loss": 0.0006, + "reward": 3.084221363067627, + "reward_std": 0.14684276282787323, + "rewards/final_reward": 1.286284375341669, + "rewards/mask_iou_reward": 0.6431421876708345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0842213034629822, + "rewards/thk_ans_format_reward": 1.0, + "step": 2032, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.96875, + "epoch": 3.433389544688027, + "grad_norm": 7.535228675402098, + "kl": 0.5205078125, + "learning_rate": 3.1433389544688027e-07, + "loss": 0.0005, + "reward": 3.4276663064956665, + "reward_std": 0.13675907254219055, + "rewards/final_reward": 1.650958786137227, + "rewards/mask_iou_reward": 0.8254793930686135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4276662468910217, + "rewards/thk_ans_format_reward": 1.0, + "step": 2033, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 3.4350758853288363, + "grad_norm": 10.269806157418087, + "kl": 0.60546875, + "learning_rate": 3.1399662731871835e-07, + "loss": 0.0006, + "reward": 3.8080811500549316, + "reward_std": 0.016493337228894234, + "rewards/final_reward": 1.7206912997820765, + "rewards/mask_iou_reward": 0.8603456498910382, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8080810904502869, + "rewards/thk_ans_format_reward": 1.0, + "step": 2034, + "think_completion_length": 46.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.640625, + "epoch": 3.436762225969646, + "grad_norm": 87.29511846158675, + "kl": 0.5078125, + "learning_rate": 3.136593591905565e-07, + "loss": 0.0005, + "reward": 3.03287672996521, + "reward_std": 0.12340293824672699, + "rewards/final_reward": 1.4833359764083096, + "rewards/mask_iou_reward": 0.7416679882041548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0328766703605652, + "rewards/thk_ans_format_reward": 1.0, + "step": 2035, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 3.438448566610455, + "grad_norm": 5.596236022765671, + "kl": 0.671875, + "learning_rate": 3.133220910623946e-07, + "loss": 0.0007, + "reward": 3.6413198709487915, + "reward_std": 0.041382129304111004, + "rewards/final_reward": 1.4613938088920493, + "rewards/mask_iou_reward": 0.7306969044460246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6413196921348572, + "rewards/thk_ans_format_reward": 1.0, + "step": 2036, + "think_completion_length": 45.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.125, + "epoch": 3.440134907251265, + "grad_norm": 6.71781932368155, + "kl": 0.59375, + "learning_rate": 3.129848229342327e-07, + "loss": 0.0006, + "reward": 3.355802297592163, + "reward_std": 0.20005353540182114, + "rewards/final_reward": 1.2739943643089002, + "rewards/mask_iou_reward": 0.6369971821544501, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3558022379875183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2037, + "think_completion_length": 39.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.296875, + "epoch": 3.441821247892074, + "grad_norm": 27.916419315586456, + "kl": 0.525390625, + "learning_rate": 3.126475548060708e-07, + "loss": 0.0005, + "reward": 3.251969575881958, + "reward_std": 0.18746953457593918, + "rewards/final_reward": 1.3600471383699275, + "rewards/mask_iou_reward": 0.6800235691849638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2519696354866028, + "rewards/thk_ans_format_reward": 1.0, + "step": 2038, + "think_completion_length": 39.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 3.4435075885328836, + "grad_norm": 7.2430728989528275, + "kl": 0.66015625, + "learning_rate": 3.1231028667790895e-07, + "loss": 0.0007, + "reward": 3.1523733139038086, + "reward_std": 0.3076479956507683, + "rewards/final_reward": 1.5067852862366253, + "rewards/mask_iou_reward": 0.7533926431183127, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.167998194694519, + "rewards/thk_ans_format_reward": 1.0, + "step": 2039, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.953125, + "epoch": 3.4451939291736933, + "grad_norm": 4.783105232050546, + "kl": 0.509765625, + "learning_rate": 3.1197301854974704e-07, + "loss": 0.0005, + "reward": 3.708993911743164, + "reward_std": 0.05525432340800762, + "rewards/final_reward": 1.8121373095830768, + "rewards/mask_iou_reward": 0.9060686547915384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7089937329292297, + "rewards/thk_ans_format_reward": 1.0, + "step": 2040, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.34375, + "epoch": 3.4468802698145025, + "grad_norm": 8.11655541151862, + "kl": 0.6875, + "learning_rate": 3.116357504215851e-07, + "loss": 0.0007, + "reward": 3.1580730676651, + "reward_std": 0.08902622014284134, + "rewards/final_reward": 1.2540957808126234, + "rewards/mask_iou_reward": 0.6270478904063117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1580730378627777, + "rewards/thk_ans_format_reward": 1.0, + "step": 2041, + "think_completion_length": 47.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8125, + "epoch": 3.448566610455312, + "grad_norm": 5.143790494484055, + "kl": 0.546875, + "learning_rate": 3.1129848229342326e-07, + "loss": 0.0005, + "reward": 3.1901299953460693, + "reward_std": 0.058364099357277155, + "rewards/final_reward": 0.6773605533276124, + "rewards/mask_iou_reward": 0.3386802766638062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1901300251483917, + "rewards/thk_ans_format_reward": 1.0, + "step": 2042, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 3.4502529510961213, + "grad_norm": 7.525608066964579, + "kl": 0.55078125, + "learning_rate": 3.109612141652614e-07, + "loss": 0.0006, + "reward": 3.572661876678467, + "reward_std": 0.022327865473926067, + "rewards/final_reward": 1.7100408556340767, + "rewards/mask_iou_reward": 0.8550204278170384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5726619958877563, + "rewards/thk_ans_format_reward": 1.0, + "step": 2043, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.328125, + "epoch": 3.451939291736931, + "grad_norm": 16.392322277931605, + "kl": 0.603515625, + "learning_rate": 3.1062394603709944e-07, + "loss": 0.0006, + "reward": 3.355070114135742, + "reward_std": 0.06577342934906483, + "rewards/final_reward": 1.3422343824513452, + "rewards/mask_iou_reward": 0.6711171912256726, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3550702929496765, + "rewards/thk_ans_format_reward": 1.0, + "step": 2044, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.046875, + "epoch": 3.45362563237774, + "grad_norm": 16.5911277595788, + "kl": 0.564453125, + "learning_rate": 3.102866779089376e-07, + "loss": 0.0006, + "reward": 3.096205711364746, + "reward_std": 0.23673050850629807, + "rewards/final_reward": 1.0256573336674095, + "rewards/mask_iou_reward": 0.5128286668337048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.096205621957779, + "rewards/thk_ans_format_reward": 1.0, + "step": 2045, + "think_completion_length": 43.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.109375, + "epoch": 3.4553119730185498, + "grad_norm": 9.168801627251366, + "kl": 0.775390625, + "learning_rate": 3.099494097807757e-07, + "loss": 0.0008, + "reward": 3.2718621492385864, + "reward_std": 0.02805233560502529, + "rewards/final_reward": 1.1143052034870835, + "rewards/mask_iou_reward": 0.5571526017435418, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2718620896339417, + "rewards/thk_ans_format_reward": 1.0, + "step": 2046, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.09375, + "epoch": 3.4569983136593594, + "grad_norm": 9.580867499703865, + "kl": 0.57421875, + "learning_rate": 3.096121416526138e-07, + "loss": 0.0006, + "reward": 3.309259295463562, + "reward_std": 0.021198630332946777, + "rewards/final_reward": 1.6347819329427153, + "rewards/mask_iou_reward": 0.8173909664713577, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.309259295463562, + "rewards/thk_ans_format_reward": 1.0, + "step": 2047, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.78125, + "epoch": 3.4586846543001686, + "grad_norm": 11.641386424264883, + "kl": 0.54296875, + "learning_rate": 3.092748735244519e-07, + "loss": 0.0005, + "reward": 3.419318437576294, + "reward_std": 0.1268703443929553, + "rewards/final_reward": 1.1149699646904263, + "rewards/mask_iou_reward": 0.5574849823452132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4193184971809387, + "rewards/thk_ans_format_reward": 1.0, + "step": 2048, + "think_completion_length": 40.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.203125, + "epoch": 3.460370994940978, + "grad_norm": 10.457163008805031, + "kl": 0.591796875, + "learning_rate": 3.0893760539629004e-07, + "loss": 0.0006, + "reward": 3.4929925203323364, + "reward_std": 0.06658563949167728, + "rewards/final_reward": 1.3797770463612256, + "rewards/mask_iou_reward": 0.6898885231806128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4929924607276917, + "rewards/thk_ans_format_reward": 1.0, + "step": 2049, + "think_completion_length": 44.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 3.4620573355817874, + "grad_norm": 12.992427945172517, + "kl": 0.57421875, + "learning_rate": 3.086003372681282e-07, + "loss": 0.0006, + "reward": 3.475190281867981, + "reward_std": 0.1516074314713478, + "rewards/final_reward": 1.3570006034645838, + "rewards/mask_iou_reward": 0.6785003017322919, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4751903414726257, + "rewards/thk_ans_format_reward": 1.0, + "step": 2050, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.421875, + "epoch": 3.463743676222597, + "grad_norm": 10.236935710119583, + "kl": 0.68359375, + "learning_rate": 3.0826306913996626e-07, + "loss": 0.0007, + "reward": 3.1847715377807617, + "reward_std": 0.059085357934236526, + "rewards/final_reward": 0.7592109866888326, + "rewards/mask_iou_reward": 0.3796054933444163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.184771478176117, + "rewards/thk_ans_format_reward": 1.0, + "step": 2051, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.65625, + "epoch": 3.4654300168634062, + "grad_norm": 67.39013973113407, + "kl": 0.572265625, + "learning_rate": 3.0792580101180435e-07, + "loss": 0.0006, + "reward": 3.023639678955078, + "reward_std": 0.22101550735533237, + "rewards/final_reward": 0.8818784059611318, + "rewards/mask_iou_reward": 0.4409392029805659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0236395299434662, + "rewards/thk_ans_format_reward": 1.0, + "step": 2052, + "think_completion_length": 48.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.390625, + "epoch": 3.467116357504216, + "grad_norm": 6.259382879220525, + "kl": 0.517578125, + "learning_rate": 3.075885328836425e-07, + "loss": 0.0005, + "reward": 3.081355333328247, + "reward_std": 0.004808083031093702, + "rewards/final_reward": 0.26189683733133445, + "rewards/mask_iou_reward": 0.13094841866566723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0813553929328918, + "rewards/thk_ans_format_reward": 1.0, + "step": 2053, + "think_completion_length": 49.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.015625, + "epoch": 3.4688026981450255, + "grad_norm": 12.992020712397627, + "kl": 0.521484375, + "learning_rate": 3.072512647554806e-07, + "loss": 0.0005, + "reward": 3.3531652688980103, + "reward_std": 0.10442159976810217, + "rewards/final_reward": 1.236209052600922, + "rewards/mask_iou_reward": 0.618104526300461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.353165328502655, + "rewards/thk_ans_format_reward": 1.0, + "step": 2054, + "think_completion_length": 46.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.28125, + "epoch": 3.4704890387858347, + "grad_norm": 20.979004051163514, + "kl": 0.65234375, + "learning_rate": 3.069139966273187e-07, + "loss": 0.0007, + "reward": 3.363083004951477, + "reward_std": 0.19395306333899498, + "rewards/final_reward": 1.5368581470637899, + "rewards/mask_iou_reward": 0.7684290735318949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.363083004951477, + "rewards/thk_ans_format_reward": 1.0, + "step": 2055, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.125, + "epoch": 3.4721753794266443, + "grad_norm": 4.559033940459481, + "kl": 0.564453125, + "learning_rate": 3.0657672849915686e-07, + "loss": 0.0005, + "reward": 3.707147717475891, + "reward_std": 0.0350824692286551, + "rewards/final_reward": 1.6528962266813516, + "rewards/mask_iou_reward": 0.8264481133406758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7071477174758911, + "rewards/thk_ans_format_reward": 1.0, + "step": 2056, + "think_completion_length": 48.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.421875, + "epoch": 3.4738617200674535, + "grad_norm": 20.549147936309854, + "kl": 0.53515625, + "learning_rate": 3.062394603709949e-07, + "loss": 0.0005, + "reward": 2.985813856124878, + "reward_std": 0.0662859920412302, + "rewards/final_reward": 0.8053970687702292, + "rewards/mask_iou_reward": 0.4026985343851146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9858137965202332, + "rewards/thk_ans_format_reward": 1.0, + "step": 2057, + "think_completion_length": 46.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.84375, + "epoch": 3.475548060708263, + "grad_norm": 9.984510333726956, + "kl": 0.5546875, + "learning_rate": 3.0590219224283303e-07, + "loss": 0.0006, + "reward": 2.868474006652832, + "reward_std": 0.028040415607392788, + "rewards/final_reward": 0.15886921552929686, + "rewards/mask_iou_reward": 0.07943460776464843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8684740662574768, + "rewards/thk_ans_format_reward": 1.0, + "step": 2058, + "think_completion_length": 50.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.09375, + "epoch": 3.4772344013490724, + "grad_norm": 8.995561128045066, + "kl": 0.478515625, + "learning_rate": 3.055649241146712e-07, + "loss": 0.0005, + "reward": 3.6218186616897583, + "reward_std": 0.07758180983364582, + "rewards/final_reward": 1.6858512783896944, + "rewards/mask_iou_reward": 0.8429256391948472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.621818482875824, + "rewards/thk_ans_format_reward": 1.0, + "step": 2059, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.875, + "epoch": 3.478920741989882, + "grad_norm": 13.009018434002423, + "kl": 0.55078125, + "learning_rate": 3.0522765598650926e-07, + "loss": 0.0005, + "reward": 3.158493995666504, + "reward_std": 0.31041108816862106, + "rewards/final_reward": 1.4331862329200882, + "rewards/mask_iou_reward": 0.7165931164600441, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1584938764572144, + "rewards/thk_ans_format_reward": 1.0, + "step": 2060, + "think_completion_length": 44.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.46875, + "epoch": 3.4806070826306916, + "grad_norm": 12.992123348597211, + "kl": 0.5703125, + "learning_rate": 3.0489038785834735e-07, + "loss": 0.0006, + "reward": 3.362663745880127, + "reward_std": 0.03213449893519282, + "rewards/final_reward": 1.0460226018486092, + "rewards/mask_iou_reward": 0.5230113009243046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3626638054847717, + "rewards/thk_ans_format_reward": 1.0, + "step": 2061, + "think_completion_length": 41.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.421875, + "epoch": 3.482293423271501, + "grad_norm": 49.926997659929846, + "kl": 0.529296875, + "learning_rate": 3.045531197301855e-07, + "loss": 0.0005, + "reward": 3.6378896236419678, + "reward_std": 0.1999678835272789, + "rewards/final_reward": 1.5644028089646156, + "rewards/mask_iou_reward": 0.7822014044823078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.637889802455902, + "rewards/thk_ans_format_reward": 1.0, + "step": 2062, + "think_completion_length": 45.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.6875, + "epoch": 3.4839797639123105, + "grad_norm": 17.026423042191496, + "kl": 0.576171875, + "learning_rate": 3.0421585160202363e-07, + "loss": 0.0006, + "reward": 3.0796077251434326, + "reward_std": 0.15330487489700317, + "rewards/final_reward": 1.2737249290539945, + "rewards/mask_iou_reward": 0.6368624645269972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0796076208353043, + "rewards/thk_ans_format_reward": 1.0, + "step": 2063, + "think_completion_length": 47.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.296875, + "epoch": 3.4856661045531196, + "grad_norm": 7.000696048676124, + "kl": 0.544921875, + "learning_rate": 3.038785834738617e-07, + "loss": 0.0005, + "reward": 3.6956236362457275, + "reward_std": 0.06873153895139694, + "rewards/final_reward": 1.710976314726648, + "rewards/mask_iou_reward": 0.855488157363324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6956236958503723, + "rewards/thk_ans_format_reward": 1.0, + "step": 2064, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.34375, + "epoch": 3.4873524451939293, + "grad_norm": 15.18804859176664, + "kl": 0.56640625, + "learning_rate": 3.035413153456998e-07, + "loss": 0.0006, + "reward": 3.263370156288147, + "reward_std": 0.18835239857435226, + "rewards/final_reward": 1.274296605476248, + "rewards/mask_iou_reward": 0.637148302738124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2633703351020813, + "rewards/thk_ans_format_reward": 1.0, + "step": 2065, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.578125, + "epoch": 3.4890387858347385, + "grad_norm": 9.809415007636295, + "kl": 0.513671875, + "learning_rate": 3.0320404721753794e-07, + "loss": 0.0005, + "reward": 3.234055280685425, + "reward_std": 0.1404377743601799, + "rewards/final_reward": 0.9628708190382448, + "rewards/mask_iou_reward": 0.4814354095191224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2340553104877472, + "rewards/thk_ans_format_reward": 1.0, + "step": 2066, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.015625, + "epoch": 3.490725126475548, + "grad_norm": 18.435724356020913, + "kl": 0.517578125, + "learning_rate": 3.0286677908937603e-07, + "loss": 0.0005, + "reward": 3.07632577419281, + "reward_std": 0.12655201670713723, + "rewards/final_reward": 1.6996674723771896, + "rewards/mask_iou_reward": 0.8498337361885948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0763257145881653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2067, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.765625, + "epoch": 3.4924114671163577, + "grad_norm": 8.139177398384799, + "kl": 0.53125, + "learning_rate": 3.0252951096121417e-07, + "loss": 0.0006, + "reward": 3.680861234664917, + "reward_std": 0.017464175820350647, + "rewards/final_reward": 1.9384369036629767, + "rewards/mask_iou_reward": 0.9692184518314884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6808614134788513, + "rewards/thk_ans_format_reward": 1.0, + "step": 2068, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.03125, + "epoch": 3.494097807757167, + "grad_norm": 6.302239880949431, + "kl": 0.560546875, + "learning_rate": 3.0219224283305226e-07, + "loss": 0.0006, + "reward": 3.526781678199768, + "reward_std": 0.08949761837720871, + "rewards/final_reward": 1.5264771402068722, + "rewards/mask_iou_reward": 0.7632385701034361, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5267817974090576, + "rewards/thk_ans_format_reward": 1.0, + "step": 2069, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.09375, + "epoch": 3.4957841483979766, + "grad_norm": 10.267958861421658, + "kl": 0.56640625, + "learning_rate": 3.0185497470489035e-07, + "loss": 0.0006, + "reward": 3.438549757003784, + "reward_std": 0.05048087425529957, + "rewards/final_reward": 1.1073131439090815, + "rewards/mask_iou_reward": 0.5536565719545408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4385497570037842, + "rewards/thk_ans_format_reward": 1.0, + "step": 2070, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.5, + "epoch": 3.4974704890387858, + "grad_norm": 9.64767925245289, + "kl": 0.59375, + "learning_rate": 3.015177065767285e-07, + "loss": 0.0006, + "reward": 3.215143084526062, + "reward_std": 0.332039512693882, + "rewards/final_reward": 1.2518763445173295, + "rewards/mask_iou_reward": 0.6259381722586648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.215143084526062, + "rewards/thk_ans_format_reward": 1.0, + "step": 2071, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.640625, + "epoch": 3.4991568296795954, + "grad_norm": 11.15703283424711, + "kl": 0.595703125, + "learning_rate": 3.0118043844856663e-07, + "loss": 0.0006, + "reward": 3.348411202430725, + "reward_std": 0.022082495968788862, + "rewards/final_reward": 0.8214227307899277, + "rewards/mask_iou_reward": 0.41071136539496383, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3484110832214355, + "rewards/thk_ans_format_reward": 1.0, + "step": 2072, + "think_completion_length": 50.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.625, + "epoch": 3.5008431703204046, + "grad_norm": 9.69937959345132, + "kl": 0.5234375, + "learning_rate": 3.0084317032040466e-07, + "loss": 0.0005, + "reward": 3.2116652727127075, + "reward_std": 0.2600217703729868, + "rewards/final_reward": 1.3683783973116639, + "rewards/mask_iou_reward": 0.6841891986558319, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2429153323173523, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2073, + "think_completion_length": 48.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.703125, + "epoch": 3.5025295109612142, + "grad_norm": 10.062390866369451, + "kl": 0.501953125, + "learning_rate": 3.005059021922428e-07, + "loss": 0.0005, + "reward": 3.4098644256591797, + "reward_std": 0.15565502271056175, + "rewards/final_reward": 1.2738018206229793, + "rewards/mask_iou_reward": 0.6369009103114897, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4098644256591797, + "rewards/thk_ans_format_reward": 1.0, + "step": 2074, + "think_completion_length": 48.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.96875, + "epoch": 3.504215851602024, + "grad_norm": 15.480097162634573, + "kl": 0.5146484375, + "learning_rate": 3.0016863406408094e-07, + "loss": 0.0005, + "reward": 3.035758376121521, + "reward_std": 0.14913135021924973, + "rewards/final_reward": 1.049987013645798, + "rewards/mask_iou_reward": 0.524993506822899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0357585549354553, + "rewards/thk_ans_format_reward": 1.0, + "step": 2075, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.71875, + "epoch": 3.505902192242833, + "grad_norm": 7.040263739913607, + "kl": 0.58203125, + "learning_rate": 2.998313659359191e-07, + "loss": 0.0006, + "reward": 3.4088956117630005, + "reward_std": 0.12369058793410659, + "rewards/final_reward": 1.4691792207676442, + "rewards/mask_iou_reward": 0.7345896103838221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4088955521583557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2076, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.109375, + "epoch": 3.5075885328836423, + "grad_norm": 8.947858317432594, + "kl": 0.53515625, + "learning_rate": 2.994940978077571e-07, + "loss": 0.0005, + "reward": 3.2634823322296143, + "reward_std": 0.2896201773546636, + "rewards/final_reward": 1.3893840332991332, + "rewards/mask_iou_reward": 0.6946920166495666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2634823322296143, + "rewards/thk_ans_format_reward": 1.0, + "step": 2077, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.046875, + "epoch": 3.509274873524452, + "grad_norm": 11.935600740389882, + "kl": 0.57421875, + "learning_rate": 2.9915682967959526e-07, + "loss": 0.0006, + "reward": 3.2649558782577515, + "reward_std": 0.41319380700588226, + "rewards/final_reward": 1.4942187138608924, + "rewards/mask_iou_reward": 0.7471093569304462, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2649559378623962, + "rewards/thk_ans_format_reward": 1.0, + "step": 2078, + "think_completion_length": 46.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.890625, + "epoch": 3.5109612141652615, + "grad_norm": 9.365130805074894, + "kl": 0.576171875, + "learning_rate": 2.988195615514334e-07, + "loss": 0.0006, + "reward": 3.0650811195373535, + "reward_std": 0.06612196192145348, + "rewards/final_reward": 0.7571907244586673, + "rewards/mask_iou_reward": 0.37859536222933365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0650811791419983, + "rewards/thk_ans_format_reward": 1.0, + "step": 2079, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.765625, + "epoch": 3.5126475548060707, + "grad_norm": 8.560600749245339, + "kl": 0.505859375, + "learning_rate": 2.984822934232715e-07, + "loss": 0.0005, + "reward": 3.578374981880188, + "reward_std": 0.3707886040210724, + "rewards/final_reward": 1.6643127275437786, + "rewards/mask_iou_reward": 0.8321563637718893, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5783750414848328, + "rewards/thk_ans_format_reward": 1.0, + "step": 2080, + "think_completion_length": 45.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.015625, + "epoch": 3.5143338954468804, + "grad_norm": 7.321228522701317, + "kl": 0.548828125, + "learning_rate": 2.9814502529510957e-07, + "loss": 0.0005, + "reward": 2.8969147205352783, + "reward_std": 0.21056237444281578, + "rewards/final_reward": 1.0241504374650594, + "rewards/mask_iou_reward": 0.5120752187325297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.89691461622715, + "rewards/thk_ans_format_reward": 1.0, + "step": 2081, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.328125, + "epoch": 3.51602023608769, + "grad_norm": 23.63771581498818, + "kl": 0.611328125, + "learning_rate": 2.978077571669477e-07, + "loss": 0.0006, + "reward": 3.10478937625885, + "reward_std": 0.08834952488541603, + "rewards/final_reward": 0.9242398913792284, + "rewards/mask_iou_reward": 0.4621199456896142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1047892272472382, + "rewards/thk_ans_format_reward": 1.0, + "step": 2082, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.828125, + "epoch": 3.517706576728499, + "grad_norm": 8.000584246347364, + "kl": 0.6640625, + "learning_rate": 2.974704890387858e-07, + "loss": 0.0006, + "reward": 3.3999558687210083, + "reward_std": 0.1016635000705719, + "rewards/final_reward": 1.271658266660058, + "rewards/mask_iou_reward": 0.635829133330029, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.399955689907074, + "rewards/thk_ans_format_reward": 1.0, + "step": 2083, + "think_completion_length": 46.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.296875, + "epoch": 3.5193929173693084, + "grad_norm": 6.863503915469555, + "kl": 0.677734375, + "learning_rate": 2.9713322091062394e-07, + "loss": 0.0007, + "reward": 3.251119613647461, + "reward_std": 0.05742297577671707, + "rewards/final_reward": 0.8811621420140534, + "rewards/mask_iou_reward": 0.4405810710070267, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2511195540428162, + "rewards/thk_ans_format_reward": 1.0, + "step": 2084, + "think_completion_length": 45.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.46875, + "epoch": 3.521079258010118, + "grad_norm": 10.149551426716297, + "kl": 0.578125, + "learning_rate": 2.967959527824621e-07, + "loss": 0.0006, + "reward": 3.618879556655884, + "reward_std": 0.1719725530128926, + "rewards/final_reward": 1.7024318654775046, + "rewards/mask_iou_reward": 0.8512159327387523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6188794374465942, + "rewards/thk_ans_format_reward": 1.0, + "step": 2085, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.328125, + "epoch": 3.5227655986509276, + "grad_norm": 11.229567750401436, + "kl": 0.53515625, + "learning_rate": 2.964586846543001e-07, + "loss": 0.0005, + "reward": 3.2309197187423706, + "reward_std": 0.06877991370856762, + "rewards/final_reward": 0.8686235580294994, + "rewards/mask_iou_reward": 0.4343117790147497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2309198379516602, + "rewards/thk_ans_format_reward": 1.0, + "step": 2086, + "think_completion_length": 48.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.140625, + "epoch": 3.524451939291737, + "grad_norm": 8.016667921124851, + "kl": 0.59375, + "learning_rate": 2.9612141652613826e-07, + "loss": 0.0006, + "reward": 3.398813247680664, + "reward_std": 0.2085051666945219, + "rewards/final_reward": 1.6540931938829095, + "rewards/mask_iou_reward": 0.8270465969414548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3988131284713745, + "rewards/thk_ans_format_reward": 1.0, + "step": 2087, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.78125, + "epoch": 3.5261382799325465, + "grad_norm": 8.287625638806832, + "kl": 0.5234375, + "learning_rate": 2.957841483979764e-07, + "loss": 0.0005, + "reward": 2.9071192741394043, + "reward_std": 0.3184506855905056, + "rewards/final_reward": 0.16841956742067227, + "rewards/mask_iou_reward": 0.08420978371033613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9071193039417267, + "rewards/thk_ans_format_reward": 1.0, + "step": 2088, + "think_completion_length": 44.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.34375, + "epoch": 3.5278246205733557, + "grad_norm": 8.820546084984757, + "kl": 0.9140625, + "learning_rate": 2.9544688026981454e-07, + "loss": 0.0009, + "reward": 3.3051774501800537, + "reward_std": 0.057986740954220295, + "rewards/final_reward": 1.616677442440604, + "rewards/mask_iou_reward": 0.808338721220302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3051775097846985, + "rewards/thk_ans_format_reward": 1.0, + "step": 2089, + "think_completion_length": 47.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.234375, + "epoch": 3.5295109612141653, + "grad_norm": 6.267811168411706, + "kl": 0.552734375, + "learning_rate": 2.9510961214165257e-07, + "loss": 0.0006, + "reward": 2.8216934204101562, + "reward_std": 0.16679880395531654, + "rewards/final_reward": 0.12753961957074536, + "rewards/mask_iou_reward": 0.06376980978537268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8216933906078339, + "rewards/thk_ans_format_reward": 1.0, + "step": 2090, + "think_completion_length": 47.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.171875, + "epoch": 3.5311973018549745, + "grad_norm": 23.750264428638094, + "kl": 0.583984375, + "learning_rate": 2.947723440134907e-07, + "loss": 0.0006, + "reward": 3.4246087074279785, + "reward_std": 0.2074672132730484, + "rewards/final_reward": 1.331872885254827, + "rewards/mask_iou_reward": 0.6659364426274135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.424608588218689, + "rewards/thk_ans_format_reward": 1.0, + "step": 2091, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.203125, + "epoch": 3.532883642495784, + "grad_norm": 27.6083076016815, + "kl": 0.57421875, + "learning_rate": 2.9443507588532885e-07, + "loss": 0.0006, + "reward": 3.294243812561035, + "reward_std": 0.05477495677769184, + "rewards/final_reward": 1.044439194811737, + "rewards/mask_iou_reward": 0.5222195974058685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2942437827587128, + "rewards/thk_ans_format_reward": 1.0, + "step": 2092, + "think_completion_length": 52.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.171875, + "epoch": 3.5345699831365938, + "grad_norm": 11.86318269840444, + "kl": 2.19140625, + "learning_rate": 2.9409780775716694e-07, + "loss": 0.0022, + "reward": 3.617302179336548, + "reward_std": 0.19097769260406494, + "rewards/final_reward": 1.4741462494303408, + "rewards/mask_iou_reward": 0.7370731247151704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.617302119731903, + "rewards/thk_ans_format_reward": 1.0, + "step": 2093, + "think_completion_length": 42.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.359375, + "epoch": 3.536256323777403, + "grad_norm": 33.15216981502035, + "kl": 0.5703125, + "learning_rate": 2.9376053962900503e-07, + "loss": 0.0006, + "reward": 3.1339809894561768, + "reward_std": 0.12357348203659058, + "rewards/final_reward": 0.784641664259337, + "rewards/mask_iou_reward": 0.3923208321296685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.133980929851532, + "rewards/thk_ans_format_reward": 1.0, + "step": 2094, + "think_completion_length": 42.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.78125, + "epoch": 3.5379426644182126, + "grad_norm": 7.771322430647064, + "kl": 0.533203125, + "learning_rate": 2.9342327150084317e-07, + "loss": 0.0005, + "reward": 3.645893096923828, + "reward_std": 0.17055297642946243, + "rewards/final_reward": 1.548257813426118, + "rewards/mask_iou_reward": 0.774128906713059, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6458930969238281, + "rewards/thk_ans_format_reward": 1.0, + "step": 2095, + "think_completion_length": 45.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.890625, + "epoch": 3.539629005059022, + "grad_norm": 16.288824310059873, + "kl": 0.5546875, + "learning_rate": 2.9308600337268125e-07, + "loss": 0.0006, + "reward": 3.142457604408264, + "reward_std": 0.24793227389454842, + "rewards/final_reward": 1.0366077066693755, + "rewards/mask_iou_reward": 0.5183038533346878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1424575448036194, + "rewards/thk_ans_format_reward": 1.0, + "step": 2096, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.578125, + "epoch": 3.5413153456998314, + "grad_norm": 7.628982481255148, + "kl": 0.607421875, + "learning_rate": 2.927487352445194e-07, + "loss": 0.0006, + "reward": 3.7804569005966187, + "reward_std": 0.09163061529397964, + "rewards/final_reward": 1.869259836081693, + "rewards/mask_iou_reward": 0.9346299180408465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7804570198059082, + "rewards/thk_ans_format_reward": 1.0, + "step": 2097, + "think_completion_length": 48.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.65625, + "epoch": 3.5430016863406406, + "grad_norm": 6.984260855630186, + "kl": 0.5859375, + "learning_rate": 2.924114671163575e-07, + "loss": 0.0006, + "reward": 3.530802369117737, + "reward_std": 0.38594286143779755, + "rewards/final_reward": 1.2610482569802852, + "rewards/mask_iou_reward": 0.6305241284901426, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.5464274883270264, + "rewards/thk_ans_format_reward": 1.0, + "step": 2098, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.3125, + "epoch": 3.5446880269814502, + "grad_norm": 14.513378746218695, + "kl": 0.580078125, + "learning_rate": 2.9207419898819557e-07, + "loss": 0.0006, + "reward": 3.3536789417266846, + "reward_std": 0.06767284963279963, + "rewards/final_reward": 1.630382387472711, + "rewards/mask_iou_reward": 0.8151911937363555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3536791801452637, + "rewards/thk_ans_format_reward": 1.0, + "step": 2099, + "think_completion_length": 48.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.53125, + "epoch": 3.54637436762226, + "grad_norm": 34.519418419568595, + "kl": 0.494140625, + "learning_rate": 2.917369308600337e-07, + "loss": 0.0005, + "reward": 3.3592960834503174, + "reward_std": 0.10136066749691963, + "rewards/final_reward": 0.8570358125793107, + "rewards/mask_iou_reward": 0.42851790628965536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3592960834503174, + "rewards/thk_ans_format_reward": 1.0, + "step": 2100, + "think_completion_length": 51.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.390625, + "epoch": 3.548060708263069, + "grad_norm": 4.852262057713968, + "kl": 0.5546875, + "learning_rate": 2.9139966273187185e-07, + "loss": 0.0006, + "reward": 2.8021715879440308, + "reward_std": 0.014212753623723984, + "rewards/final_reward": 0.7047118073511891, + "rewards/mask_iou_reward": 0.35235590367559455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8021714687347412, + "rewards/thk_ans_format_reward": 1.0, + "step": 2101, + "think_completion_length": 45.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.96875, + "epoch": 3.5497470489038787, + "grad_norm": 70.12061063949474, + "kl": 0.55859375, + "learning_rate": 2.9106239460370994e-07, + "loss": 0.0006, + "reward": 3.3605268001556396, + "reward_std": 0.2995372787117958, + "rewards/final_reward": 1.2963307667207167, + "rewards/mask_iou_reward": 0.6481653833603583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3605269193649292, + "rewards/thk_ans_format_reward": 1.0, + "step": 2102, + "think_completion_length": 50.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.09375, + "epoch": 3.551433389544688, + "grad_norm": 11.10089122007049, + "kl": 0.544921875, + "learning_rate": 2.90725126475548e-07, + "loss": 0.0005, + "reward": 3.38996422290802, + "reward_std": 0.15020517259836197, + "rewards/final_reward": 1.464758414052994, + "rewards/mask_iou_reward": 0.732379207026497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.38996422290802, + "rewards/thk_ans_format_reward": 1.0, + "step": 2103, + "think_completion_length": 47.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.265625, + "epoch": 3.5531197301854975, + "grad_norm": 13.02278757136033, + "kl": 0.55078125, + "learning_rate": 2.9038785834738617e-07, + "loss": 0.0006, + "reward": 3.147314429283142, + "reward_std": 0.14261020720005035, + "rewards/final_reward": 1.344845371331086, + "rewards/mask_iou_reward": 0.672422685665543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1473142802715302, + "rewards/thk_ans_format_reward": 1.0, + "step": 2104, + "think_completion_length": 41.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.78125, + "epoch": 3.5548060708263067, + "grad_norm": 11.260745371599246, + "kl": 0.5166015625, + "learning_rate": 2.900505902192243e-07, + "loss": 0.0006, + "reward": 3.1299134492874146, + "reward_std": 0.04871807433664799, + "rewards/final_reward": 1.3929734948617756, + "rewards/mask_iou_reward": 0.6964867474308878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1299134194850922, + "rewards/thk_ans_format_reward": 1.0, + "step": 2105, + "think_completion_length": 50.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.59375, + "epoch": 3.5564924114671164, + "grad_norm": 14.253061236679251, + "kl": 0.525390625, + "learning_rate": 2.8971332209106234e-07, + "loss": 0.0005, + "reward": 3.40402615070343, + "reward_std": 0.23270705668255687, + "rewards/final_reward": 1.2606720556852906, + "rewards/mask_iou_reward": 0.6303360278426453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4040260016918182, + "rewards/thk_ans_format_reward": 1.0, + "step": 2106, + "think_completion_length": 48.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 3.558178752107926, + "grad_norm": 9.729457631519049, + "kl": 0.443359375, + "learning_rate": 2.893760539629005e-07, + "loss": 0.0004, + "reward": 3.751418352127075, + "reward_std": 0.25814956426620483, + "rewards/final_reward": 1.8125, + "rewards/mask_iou_reward": 0.90625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7514183521270752, + "rewards/thk_ans_format_reward": 1.0, + "step": 2107, + "think_completion_length": 44.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.625, + "epoch": 3.559865092748735, + "grad_norm": 9.294937389808833, + "kl": 0.55859375, + "learning_rate": 2.890387858347386e-07, + "loss": 0.0006, + "reward": 3.4791066646575928, + "reward_std": 0.26504068821668625, + "rewards/final_reward": 1.3581809205721262, + "rewards/mask_iou_reward": 0.6790904602860631, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4791066646575928, + "rewards/thk_ans_format_reward": 1.0, + "step": 2108, + "think_completion_length": 47.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.734375, + "epoch": 3.561551433389545, + "grad_norm": 10.856018909508016, + "kl": 0.568359375, + "learning_rate": 2.887015177065767e-07, + "loss": 0.0006, + "reward": 3.142184615135193, + "reward_std": 0.06791120395064354, + "rewards/final_reward": 1.3566191082046042, + "rewards/mask_iou_reward": 0.6783095541023021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1421846747398376, + "rewards/thk_ans_format_reward": 1.0, + "step": 2109, + "think_completion_length": 48.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 3.563237774030354, + "grad_norm": 6.750033064551902, + "kl": 0.55859375, + "learning_rate": 2.883642495784148e-07, + "loss": 0.0006, + "reward": 3.1395514011383057, + "reward_std": 0.1944444328546524, + "rewards/final_reward": 1.059547095767144, + "rewards/mask_iou_reward": 0.529773547883572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1395514905452728, + "rewards/thk_ans_format_reward": 1.0, + "step": 2110, + "think_completion_length": 44.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.5625, + "epoch": 3.5649241146711637, + "grad_norm": 8.25354027902421, + "kl": 0.517578125, + "learning_rate": 2.8802698145025294e-07, + "loss": 0.0006, + "reward": 3.530856966972351, + "reward_std": 0.1734582866774872, + "rewards/final_reward": 1.8629343088128367, + "rewards/mask_iou_reward": 0.9314671544064184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5308570265769958, + "rewards/thk_ans_format_reward": 1.0, + "step": 2111, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.53125, + "epoch": 3.566610455311973, + "grad_norm": 14.605550783250727, + "kl": 0.533203125, + "learning_rate": 2.876897133220911e-07, + "loss": 0.0006, + "reward": 3.368571400642395, + "reward_std": 0.017685976112261415, + "rewards/final_reward": 1.5584769869884578, + "rewards/mask_iou_reward": 0.7792384934942289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3685712814331055, + "rewards/thk_ans_format_reward": 1.0, + "step": 2112, + "think_completion_length": 41.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.59375, + "epoch": 3.5682967959527825, + "grad_norm": 10.667209261447056, + "kl": 0.5078125, + "learning_rate": 2.8735244519392916e-07, + "loss": 0.0005, + "reward": 2.9554500579833984, + "reward_std": 0.12431775592267513, + "rewards/final_reward": 0.3111894509782488, + "rewards/mask_iou_reward": 0.1555947254891244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9554500877857208, + "rewards/thk_ans_format_reward": 1.0, + "step": 2113, + "think_completion_length": 44.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5, + "epoch": 3.569983136593592, + "grad_norm": 7.334798139651193, + "kl": 0.537109375, + "learning_rate": 2.870151770657673e-07, + "loss": 0.0005, + "reward": 3.0563929080963135, + "reward_std": 0.16057665273547173, + "rewards/final_reward": 0.77717497274694, + "rewards/mask_iou_reward": 0.38858748637347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0563929378986359, + "rewards/thk_ans_format_reward": 1.0, + "step": 2114, + "think_completion_length": 48.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.53125, + "epoch": 3.5716694772344013, + "grad_norm": 9.965742861168454, + "kl": 0.513671875, + "learning_rate": 2.866779089376054e-07, + "loss": 0.0005, + "reward": 3.605561137199402, + "reward_std": 0.06425703875720501, + "rewards/final_reward": 1.7104030906984011, + "rewards/mask_iou_reward": 0.8552015453492006, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6055611371994019, + "rewards/thk_ans_format_reward": 1.0, + "step": 2115, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.859375, + "epoch": 3.573355817875211, + "grad_norm": 10.606011190407303, + "kl": 0.5078125, + "learning_rate": 2.863406408094435e-07, + "loss": 0.0005, + "reward": 3.346148729324341, + "reward_std": 0.08017969503998756, + "rewards/final_reward": 1.2055786473120078, + "rewards/mask_iou_reward": 0.6027893236560039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3461489081382751, + "rewards/thk_ans_format_reward": 1.0, + "step": 2116, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.078125, + "epoch": 3.57504215851602, + "grad_norm": 16.286723500192668, + "kl": 0.5068359375, + "learning_rate": 2.860033726812816e-07, + "loss": 0.0005, + "reward": 3.3839882612228394, + "reward_std": 0.0907436553388834, + "rewards/final_reward": 1.8187623688096652, + "rewards/mask_iou_reward": 0.9093811844048326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.383988231420517, + "rewards/thk_ans_format_reward": 1.0, + "step": 2117, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.90625, + "epoch": 3.5767284991568298, + "grad_norm": 7.10774667990878, + "kl": 0.521484375, + "learning_rate": 2.8566610455311976e-07, + "loss": 0.0005, + "reward": 3.3456366062164307, + "reward_std": 0.03418039623647928, + "rewards/final_reward": 1.7127510501139098, + "rewards/mask_iou_reward": 0.8563755250569549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3456366062164307, + "rewards/thk_ans_format_reward": 1.0, + "step": 2118, + "think_completion_length": 44.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.109375, + "epoch": 3.578414839797639, + "grad_norm": 13.503292561770218, + "kl": 0.55859375, + "learning_rate": 2.853288364249578e-07, + "loss": 0.0006, + "reward": 3.763441801071167, + "reward_std": 0.032608107663691044, + "rewards/final_reward": 1.7397966034509587, + "rewards/mask_iou_reward": 0.8698983017254793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.763441801071167, + "rewards/thk_ans_format_reward": 1.0, + "step": 2119, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.234375, + "epoch": 3.5801011804384486, + "grad_norm": 12.169735293725726, + "kl": 0.61328125, + "learning_rate": 2.8499156829679593e-07, + "loss": 0.0006, + "reward": 2.9567670822143555, + "reward_std": 0.3452972024679184, + "rewards/final_reward": 0.9493169032549391, + "rewards/mask_iou_reward": 0.47465845162746956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9567671567201614, + "rewards/thk_ans_format_reward": 1.0, + "step": 2120, + "think_completion_length": 45.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.390625, + "epoch": 3.5817875210792582, + "grad_norm": 72.34801198564061, + "kl": 0.595703125, + "learning_rate": 2.846543001686341e-07, + "loss": 0.0006, + "reward": 3.4123685359954834, + "reward_std": 0.07814565277658403, + "rewards/final_reward": 1.441254424751838, + "rewards/mask_iou_reward": 0.720627212375919, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4123685359954834, + "rewards/thk_ans_format_reward": 1.0, + "step": 2121, + "think_completion_length": 46.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.046875, + "epoch": 3.5834738617200674, + "grad_norm": 18.001668662737146, + "kl": 0.595703125, + "learning_rate": 2.8431703204047216e-07, + "loss": 0.0006, + "reward": 3.4009355306625366, + "reward_std": 0.226658396422863, + "rewards/final_reward": 1.5326358044587276, + "rewards/mask_iou_reward": 0.7663179022293638, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.4165604710578918, + "rewards/thk_ans_format_reward": 1.0, + "step": 2122, + "think_completion_length": 49.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.734375, + "epoch": 3.5851602023608766, + "grad_norm": 6.756132574804486, + "kl": 0.501953125, + "learning_rate": 2.8397976391231025e-07, + "loss": 0.0005, + "reward": 3.0923595428466797, + "reward_std": 0.4571046978235245, + "rewards/final_reward": 0.6847192336418062, + "rewards/mask_iou_reward": 0.3423596168209031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0923596620559692, + "rewards/thk_ans_format_reward": 1.0, + "step": 2123, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.71875, + "epoch": 3.5868465430016863, + "grad_norm": 4.699953491289403, + "kl": 0.443359375, + "learning_rate": 2.836424957841484e-07, + "loss": 0.0004, + "reward": 3.352581024169922, + "reward_std": 0.12979009747505188, + "rewards/final_reward": 1.2676309486607986, + "rewards/mask_iou_reward": 0.6338154743303993, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3525812029838562, + "rewards/thk_ans_format_reward": 1.0, + "step": 2124, + "think_completion_length": 40.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.734375, + "epoch": 3.588532883642496, + "grad_norm": 9.479851297811361, + "kl": 0.564453125, + "learning_rate": 2.8330522765598653e-07, + "loss": 0.0006, + "reward": 3.3169244527816772, + "reward_std": 0.07373960688710213, + "rewards/final_reward": 1.3375584167549102, + "rewards/mask_iou_reward": 0.6687792083774551, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.316924512386322, + "rewards/thk_ans_format_reward": 1.0, + "step": 2125, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.59375, + "epoch": 3.590219224283305, + "grad_norm": 9.675427623917646, + "kl": 0.6015625, + "learning_rate": 2.829679595278246e-07, + "loss": 0.0006, + "reward": 3.5323774814605713, + "reward_std": 0.021160707343369722, + "rewards/final_reward": 1.3137672896730819, + "rewards/mask_iou_reward": 0.6568836448365409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.532377541065216, + "rewards/thk_ans_format_reward": 1.0, + "step": 2126, + "think_completion_length": 43.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.640625, + "epoch": 3.5919055649241147, + "grad_norm": 14.935509325663402, + "kl": 0.60546875, + "learning_rate": 2.826306913996627e-07, + "loss": 0.0006, + "reward": 3.5516492128372192, + "reward_std": 0.07431310974061489, + "rewards/final_reward": 1.1481026905321163, + "rewards/mask_iou_reward": 0.5740513452660582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5516492128372192, + "rewards/thk_ans_format_reward": 1.0, + "step": 2127, + "think_completion_length": 43.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.484375, + "epoch": 3.5935919055649244, + "grad_norm": 14.142273296969108, + "kl": 0.421875, + "learning_rate": 2.8229342327150084e-07, + "loss": 0.0004, + "reward": 3.523800492286682, + "reward_std": 0.23098544776439667, + "rewards/final_reward": 1.2040698640666694, + "rewards/mask_iou_reward": 0.6020349320333347, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.555050551891327, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2128, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.9375, + "epoch": 3.5952782462057336, + "grad_norm": 15.506942761325954, + "kl": 0.541015625, + "learning_rate": 2.8195615514333893e-07, + "loss": 0.0005, + "reward": 2.981836199760437, + "reward_std": 0.05073126032948494, + "rewards/final_reward": 1.829483273128682, + "rewards/mask_iou_reward": 0.914741636564341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.981836199760437, + "rewards/thk_ans_format_reward": 1.0, + "step": 2129, + "think_completion_length": 44.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.84375, + "epoch": 3.5969645868465427, + "grad_norm": 13.470969997373983, + "kl": 0.55078125, + "learning_rate": 2.8161888701517707e-07, + "loss": 0.0005, + "reward": 3.1844255924224854, + "reward_std": 0.09609784232452512, + "rewards/final_reward": 1.4231125763751638, + "rewards/mask_iou_reward": 0.7115562881875819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.200050711631775, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2130, + "think_completion_length": 45.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.03125, + "epoch": 3.5986509274873524, + "grad_norm": 4.3856864863237845, + "kl": 0.568359375, + "learning_rate": 2.8128161888701516e-07, + "loss": 0.0005, + "reward": 3.5685439109802246, + "reward_std": 0.07929562008939683, + "rewards/final_reward": 1.2761401732846926, + "rewards/mask_iou_reward": 0.6380700866423463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.568543791770935, + "rewards/thk_ans_format_reward": 1.0, + "step": 2131, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.1875, + "epoch": 3.600337268128162, + "grad_norm": 5.688163167135513, + "kl": 0.595703125, + "learning_rate": 2.8094435075885325e-07, + "loss": 0.0006, + "reward": 2.721096992492676, + "reward_std": 0.0583833334967494, + "rewards/final_reward": 0.7634238064732274, + "rewards/mask_iou_reward": 0.3817119032366137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7210970818996429, + "rewards/thk_ans_format_reward": 1.0, + "step": 2132, + "think_completion_length": 44.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.65625, + "epoch": 3.602023608768971, + "grad_norm": 5.832818627047222, + "kl": 0.52734375, + "learning_rate": 2.806070826306914e-07, + "loss": 0.0005, + "reward": 3.522038459777832, + "reward_std": 0.04175010113976896, + "rewards/final_reward": 1.7146074397476556, + "rewards/mask_iou_reward": 0.8573037198738278, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5220386385917664, + "rewards/thk_ans_format_reward": 1.0, + "step": 2133, + "think_completion_length": 44.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.65625, + "epoch": 3.603709949409781, + "grad_norm": 27.303612577890018, + "kl": 0.599609375, + "learning_rate": 2.8026981450252953e-07, + "loss": 0.0006, + "reward": 3.473210334777832, + "reward_std": 0.07562324404716492, + "rewards/final_reward": 1.753751732934803, + "rewards/mask_iou_reward": 0.8768758664674015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.473210334777832, + "rewards/thk_ans_format_reward": 1.0, + "step": 2134, + "think_completion_length": 45.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.40625, + "epoch": 3.6053962900505905, + "grad_norm": 6.878462017998155, + "kl": 0.5546875, + "learning_rate": 2.7993254637436756e-07, + "loss": 0.0006, + "reward": 3.054173469543457, + "reward_std": 0.2945948615670204, + "rewards/final_reward": 0.9387926834905617, + "rewards/mask_iou_reward": 0.46939634174528083, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.085423469543457, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2135, + "think_completion_length": 47.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.90625, + "epoch": 3.6070826306913997, + "grad_norm": 10.653031644305647, + "kl": 0.5625, + "learning_rate": 2.795952782462057e-07, + "loss": 0.0006, + "reward": 3.5420931577682495, + "reward_std": 0.0636497251689434, + "rewards/final_reward": 1.819818398576865, + "rewards/mask_iou_reward": 0.9099091992884325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5420931577682495, + "rewards/thk_ans_format_reward": 1.0, + "step": 2136, + "think_completion_length": 46.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.890625, + "epoch": 3.608768971332209, + "grad_norm": 10.843170586952544, + "kl": 0.6015625, + "learning_rate": 2.7925801011804384e-07, + "loss": 0.0006, + "reward": 3.737997055053711, + "reward_std": 0.013210067059844732, + "rewards/final_reward": 1.795223586591173, + "rewards/mask_iou_reward": 0.8976117932955865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7379971742630005, + "rewards/thk_ans_format_reward": 1.0, + "step": 2137, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.640625, + "epoch": 3.6104553119730185, + "grad_norm": 16.198534079929008, + "kl": 0.546875, + "learning_rate": 2.78920741989882e-07, + "loss": 0.0005, + "reward": 3.37510347366333, + "reward_std": 0.06835968187078834, + "rewards/final_reward": 1.3537600217636836, + "rewards/mask_iou_reward": 0.6768800108818418, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3751035332679749, + "rewards/thk_ans_format_reward": 1.0, + "step": 2138, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.03125, + "epoch": 3.612141652613828, + "grad_norm": 5.91434083685604, + "kl": 0.5859375, + "learning_rate": 2.7858347386172e-07, + "loss": 0.0006, + "reward": 3.16507625579834, + "reward_std": 0.0929767694324255, + "rewards/final_reward": 1.3173989301904212, + "rewards/mask_iou_reward": 0.6586994650952106, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1650761365890503, + "rewards/thk_ans_format_reward": 1.0, + "step": 2139, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.28125, + "epoch": 3.6138279932546373, + "grad_norm": 8.373994110782936, + "kl": 0.54296875, + "learning_rate": 2.7824620573355816e-07, + "loss": 0.0005, + "reward": 3.7722244262695312, + "reward_std": 0.06805156078189611, + "rewards/final_reward": 1.8827298616919066, + "rewards/mask_iou_reward": 0.9413649308459533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7722243666648865, + "rewards/thk_ans_format_reward": 1.0, + "step": 2140, + "think_completion_length": 40.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.53125, + "epoch": 3.615514333895447, + "grad_norm": 17.268896812503602, + "kl": 0.556640625, + "learning_rate": 2.779089376053963e-07, + "loss": 0.0006, + "reward": 2.971988797187805, + "reward_std": 0.08881898410618305, + "rewards/final_reward": 0.6324427287793092, + "rewards/mask_iou_reward": 0.3162213643896546, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.97198885679245, + "rewards/thk_ans_format_reward": 1.0, + "step": 2141, + "think_completion_length": 48.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.65625, + "epoch": 3.6172006745362566, + "grad_norm": 6.8642257578860955, + "kl": 0.609375, + "learning_rate": 2.775716694772344e-07, + "loss": 0.0006, + "reward": 3.305831551551819, + "reward_std": 0.19308728724718094, + "rewards/final_reward": 0.966853426385776, + "rewards/mask_iou_reward": 0.483426713192888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3058315515518188, + "rewards/thk_ans_format_reward": 1.0, + "step": 2142, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.609375, + "epoch": 3.618887015177066, + "grad_norm": 13.044769944871176, + "kl": 0.4228515625, + "learning_rate": 2.7723440134907247e-07, + "loss": 0.0004, + "reward": 3.3725874423980713, + "reward_std": 0.1456987876445055, + "rewards/final_reward": 1.3800859115573243, + "rewards/mask_iou_reward": 0.6900429557786621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3725875616073608, + "rewards/thk_ans_format_reward": 1.0, + "step": 2143, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.4375, + "epoch": 3.620573355817875, + "grad_norm": 12.654651665103493, + "kl": 0.5703125, + "learning_rate": 2.768971332209106e-07, + "loss": 0.0006, + "reward": 3.2385579347610474, + "reward_std": 0.20924285799264908, + "rewards/final_reward": 1.1544221282593377, + "rewards/mask_iou_reward": 0.5772110641296688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2385579347610474, + "rewards/thk_ans_format_reward": 1.0, + "step": 2144, + "think_completion_length": 38.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.78125, + "epoch": 3.6222596964586846, + "grad_norm": 9.375804256565166, + "kl": 0.66015625, + "learning_rate": 2.765598650927487e-07, + "loss": 0.0007, + "reward": 3.3023223876953125, + "reward_std": 0.09674730151891708, + "rewards/final_reward": 1.332632176942337, + "rewards/mask_iou_reward": 0.6663160884711685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3023223280906677, + "rewards/thk_ans_format_reward": 1.0, + "step": 2145, + "think_completion_length": 43.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.78125, + "epoch": 3.6239460370994943, + "grad_norm": 77.38104026847395, + "kl": 0.5703125, + "learning_rate": 2.7622259696458684e-07, + "loss": 0.0006, + "reward": 3.5058757066726685, + "reward_std": 0.0840764888562262, + "rewards/final_reward": 1.5100866179690322, + "rewards/mask_iou_reward": 0.7550433089845161, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5058757662773132, + "rewards/thk_ans_format_reward": 1.0, + "step": 2146, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.890625, + "epoch": 3.6256323777403034, + "grad_norm": 13.735014321000364, + "kl": 0.564453125, + "learning_rate": 2.75885328836425e-07, + "loss": 0.0006, + "reward": 3.601733446121216, + "reward_std": 0.02019692724570632, + "rewards/final_reward": 1.3094032991992814, + "rewards/mask_iou_reward": 0.6547016495996407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6017334461212158, + "rewards/thk_ans_format_reward": 1.0, + "step": 2147, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.625, + "epoch": 3.627318718381113, + "grad_norm": 6.036157171143187, + "kl": 0.50390625, + "learning_rate": 2.75548060708263e-07, + "loss": 0.0005, + "reward": 3.8341445922851562, + "reward_std": 0.07307778589893132, + "rewards/final_reward": 1.8001251496910746, + "rewards/mask_iou_reward": 0.9000625748455373, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8341445326805115, + "rewards/thk_ans_format_reward": 1.0, + "step": 2148, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.734375, + "epoch": 3.6290050590219223, + "grad_norm": 7.463962254660814, + "kl": 0.5625, + "learning_rate": 2.7521079258010116e-07, + "loss": 0.0005, + "reward": 3.830557703971863, + "reward_std": 0.0030419373651966453, + "rewards/final_reward": 1.9596382233535177, + "rewards/mask_iou_reward": 0.9798191116767588, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8305577635765076, + "rewards/thk_ans_format_reward": 1.0, + "step": 2149, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.140625, + "epoch": 3.630691399662732, + "grad_norm": 18.007528140348732, + "kl": 0.513671875, + "learning_rate": 2.748735244519393e-07, + "loss": 0.0005, + "reward": 3.420504570007324, + "reward_std": 0.07008447870612144, + "rewards/final_reward": 1.7162632921924779, + "rewards/mask_iou_reward": 0.8581316460962389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4205045104026794, + "rewards/thk_ans_format_reward": 1.0, + "step": 2150, + "think_completion_length": 47.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.453125, + "epoch": 3.632377740303541, + "grad_norm": 6.8912677980775126, + "kl": 0.57421875, + "learning_rate": 2.7453625632377744e-07, + "loss": 0.0006, + "reward": 3.775565028190613, + "reward_std": 0.05060883052647114, + "rewards/final_reward": 1.847828195153058, + "rewards/mask_iou_reward": 0.923914097576529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7755651473999023, + "rewards/thk_ans_format_reward": 1.0, + "step": 2151, + "think_completion_length": 43.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.171875, + "epoch": 3.6340640809443507, + "grad_norm": 10.819998287532302, + "kl": 0.5234375, + "learning_rate": 2.7419898819561547e-07, + "loss": 0.0005, + "reward": 3.4836515188217163, + "reward_std": 0.36593368649482727, + "rewards/final_reward": 1.224108838536167, + "rewards/mask_iou_reward": 0.6120544192680835, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4836515188217163, + "rewards/thk_ans_format_reward": 1.0, + "step": 2152, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.515625, + "epoch": 3.6357504215851604, + "grad_norm": 6.744144134909996, + "kl": 0.53125, + "learning_rate": 2.738617200674536e-07, + "loss": 0.0005, + "reward": 3.1626614332199097, + "reward_std": 0.12014555651694536, + "rewards/final_reward": 1.2144468510600666, + "rewards/mask_iou_reward": 0.6072234255300333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.162661463022232, + "rewards/thk_ans_format_reward": 1.0, + "step": 2153, + "think_completion_length": 40.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.03125, + "epoch": 3.6374367622259696, + "grad_norm": 26.616278710272272, + "kl": 0.564453125, + "learning_rate": 2.7352445193929175e-07, + "loss": 0.0006, + "reward": 2.7842090129852295, + "reward_std": 0.33606940880417824, + "rewards/final_reward": 0.5275331489239576, + "rewards/mask_iou_reward": 0.2637665744619788, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7842088639736176, + "rewards/thk_ans_format_reward": 1.0, + "step": 2154, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0, + "epoch": 3.639123102866779, + "grad_norm": 10.687676166681745, + "kl": 0.474609375, + "learning_rate": 2.7318718381112984e-07, + "loss": 0.0005, + "reward": 3.517017364501953, + "reward_std": 0.2570660449564457, + "rewards/final_reward": 1.442429098155051, + "rewards/mask_iou_reward": 0.7212145490775255, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.5482673048973083, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2155, + "think_completion_length": 39.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.484375, + "epoch": 3.6408094435075884, + "grad_norm": 13.233145865190844, + "kl": 0.587890625, + "learning_rate": 2.7284991568296793e-07, + "loss": 0.0006, + "reward": 3.3401609659194946, + "reward_std": 0.13841368909925222, + "rewards/final_reward": 0.8899339333062407, + "rewards/mask_iou_reward": 0.44496696665312035, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3714109063148499, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2156, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 3.642495784148398, + "grad_norm": 6.341052709486836, + "kl": 0.61328125, + "learning_rate": 2.7251264755480607e-07, + "loss": 0.0007, + "reward": 2.9961708784103394, + "reward_std": 0.06950226402841508, + "rewards/final_reward": 1.7549071371048877, + "rewards/mask_iou_reward": 0.8774535685524438, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9961709380149841, + "rewards/thk_ans_format_reward": 1.0, + "step": 2157, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.359375, + "epoch": 3.6441821247892072, + "grad_norm": 9.988799852208986, + "kl": 0.54296875, + "learning_rate": 2.7217537942664415e-07, + "loss": 0.0005, + "reward": 3.432819962501526, + "reward_std": 0.12654725369066, + "rewards/final_reward": 1.3642316455310084, + "rewards/mask_iou_reward": 0.6821158227655042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432819902896881, + "rewards/thk_ans_format_reward": 1.0, + "step": 2158, + "think_completion_length": 44.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.859375, + "epoch": 3.645868465430017, + "grad_norm": 9.507871004604434, + "kl": 0.572265625, + "learning_rate": 2.718381112984823e-07, + "loss": 0.0006, + "reward": 3.5598703622817993, + "reward_std": 0.06046362966299057, + "rewards/final_reward": 1.534753833970691, + "rewards/mask_iou_reward": 0.7673769169853455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5598702430725098, + "rewards/thk_ans_format_reward": 1.0, + "step": 2159, + "think_completion_length": 41.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.15625, + "epoch": 3.6475548060708265, + "grad_norm": 16.92326930924938, + "kl": 0.556640625, + "learning_rate": 2.715008431703204e-07, + "loss": 0.0006, + "reward": 3.11311411857605, + "reward_std": 0.11794350296258926, + "rewards/final_reward": 1.8428080052810945, + "rewards/mask_iou_reward": 0.9214040026405472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1131139993667603, + "rewards/thk_ans_format_reward": 1.0, + "step": 2160, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 3.6492411467116357, + "grad_norm": 107.29921425149882, + "kl": 0.53515625, + "learning_rate": 2.7116357504215847e-07, + "loss": 0.0005, + "reward": 3.799378275871277, + "reward_std": 0.042371081188321114, + "rewards/final_reward": 1.7386770075593936, + "rewards/mask_iou_reward": 0.8693385037796968, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7993780970573425, + "rewards/thk_ans_format_reward": 1.0, + "step": 2161, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.140625, + "epoch": 3.6509274873524453, + "grad_norm": 5.846962924465535, + "kl": 0.564453125, + "learning_rate": 2.708263069139966e-07, + "loss": 0.0005, + "reward": 3.193526268005371, + "reward_std": 0.2762054353952408, + "rewards/final_reward": 1.193118101721124, + "rewards/mask_iou_reward": 0.596559050860562, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2091514766216278, + "rewards/thk_ans_format_reward": 1.0, + "step": 2162, + "think_completion_length": 44.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.25, + "epoch": 3.6526138279932545, + "grad_norm": 8.83884476539582, + "kl": 0.54296875, + "learning_rate": 2.7048903878583475e-07, + "loss": 0.0005, + "reward": 3.3878525495529175, + "reward_std": 0.04130503349006176, + "rewards/final_reward": 1.0849541941114242, + "rewards/mask_iou_reward": 0.5424770970557121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3878525495529175, + "rewards/thk_ans_format_reward": 1.0, + "step": 2163, + "think_completion_length": 43.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.65625, + "epoch": 3.654300168634064, + "grad_norm": 7.364941555058796, + "kl": 0.62109375, + "learning_rate": 2.7015177065767284e-07, + "loss": 0.0006, + "reward": 3.138849139213562, + "reward_std": 0.16283194720745087, + "rewards/final_reward": 1.3491245156995535, + "rewards/mask_iou_reward": 0.6745622578497767, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1388492584228516, + "rewards/thk_ans_format_reward": 1.0, + "step": 2164, + "think_completion_length": 42.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.296875, + "epoch": 3.6559865092748733, + "grad_norm": 7.3929923391392975, + "kl": 0.515625, + "learning_rate": 2.698145025295109e-07, + "loss": 0.0005, + "reward": 3.4643598794937134, + "reward_std": 0.03516603959724307, + "rewards/final_reward": 1.3184992668719937, + "rewards/mask_iou_reward": 0.6592496334359969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4643598198890686, + "rewards/thk_ans_format_reward": 1.0, + "step": 2165, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.234375, + "epoch": 3.657672849915683, + "grad_norm": 9.281403489514085, + "kl": 0.5859375, + "learning_rate": 2.6947723440134907e-07, + "loss": 0.0006, + "reward": 3.5559327602386475, + "reward_std": 0.0707951420918107, + "rewards/final_reward": 1.3471013670947625, + "rewards/mask_iou_reward": 0.6735506835473812, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5559325814247131, + "rewards/thk_ans_format_reward": 1.0, + "step": 2166, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.125, + "epoch": 3.6593591905564926, + "grad_norm": 9.020458741884312, + "kl": 0.58203125, + "learning_rate": 2.691399662731872e-07, + "loss": 0.0006, + "reward": 3.6217806339263916, + "reward_std": 0.22465556859970093, + "rewards/final_reward": 1.7130859668021712, + "rewards/mask_iou_reward": 0.8565429834010856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6217804551124573, + "rewards/thk_ans_format_reward": 1.0, + "step": 2167, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.15625, + "epoch": 3.661045531197302, + "grad_norm": 33.60697679015086, + "kl": 0.703125, + "learning_rate": 2.6880269814502524e-07, + "loss": 0.0007, + "reward": 3.453278422355652, + "reward_std": 0.10349838621914387, + "rewards/final_reward": 1.295618368161848, + "rewards/mask_iou_reward": 0.647809184080924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4532784223556519, + "rewards/thk_ans_format_reward": 1.0, + "step": 2168, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.484375, + "epoch": 3.6627318718381114, + "grad_norm": 20.92072593765282, + "kl": 0.837890625, + "learning_rate": 2.684654300168634e-07, + "loss": 0.0008, + "reward": 3.924134373664856, + "reward_std": 0.006402852479368448, + "rewards/final_reward": 1.9268632408703417, + "rewards/mask_iou_reward": 0.9634316204351708, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.924134373664856, + "rewards/thk_ans_format_reward": 1.0, + "step": 2169, + "think_completion_length": 40.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.078125, + "epoch": 3.6644182124789206, + "grad_norm": 10.85155357900207, + "kl": 0.583984375, + "learning_rate": 2.681281618887015e-07, + "loss": 0.0006, + "reward": 3.402068257331848, + "reward_std": 0.025438982993364334, + "rewards/final_reward": 1.6983160717437744, + "rewards/mask_iou_reward": 0.8491580358718872, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4020682573318481, + "rewards/thk_ans_format_reward": 1.0, + "step": 2170, + "think_completion_length": 43.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.453125, + "epoch": 3.6661045531197303, + "grad_norm": 6.932460295196636, + "kl": 0.6640625, + "learning_rate": 2.677908937605396e-07, + "loss": 0.0007, + "reward": 3.6236302852630615, + "reward_std": 0.2714125607162714, + "rewards/final_reward": 1.7773782817528996, + "rewards/mask_iou_reward": 0.8886891408764498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6236302852630615, + "rewards/thk_ans_format_reward": 1.0, + "step": 2171, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.15625, + "epoch": 3.6677908937605395, + "grad_norm": 8.724160078814055, + "kl": 0.68359375, + "learning_rate": 2.674536256323777e-07, + "loss": 0.0007, + "reward": 3.245076298713684, + "reward_std": 0.17124823480844498, + "rewards/final_reward": 1.6967053509231722, + "rewards/mask_iou_reward": 0.8483526754615861, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.245076298713684, + "rewards/thk_ans_format_reward": 1.0, + "step": 2172, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.140625, + "epoch": 3.669477234401349, + "grad_norm": 10.374725265579276, + "kl": 0.6015625, + "learning_rate": 2.6711635750421584e-07, + "loss": 0.0006, + "reward": 3.354717493057251, + "reward_std": 0.0516207218170166, + "rewards/final_reward": 1.6412412625450845, + "rewards/mask_iou_reward": 0.8206206312725423, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.354717493057251, + "rewards/thk_ans_format_reward": 1.0, + "step": 2173, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.359375, + "epoch": 3.6711635750421587, + "grad_norm": 10.050664184936608, + "kl": 0.5703125, + "learning_rate": 2.66779089376054e-07, + "loss": 0.0006, + "reward": 3.3863418102264404, + "reward_std": 0.05082100164145231, + "rewards/final_reward": 1.3180649394657395, + "rewards/mask_iou_reward": 0.6590324697328698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3863418102264404, + "rewards/thk_ans_format_reward": 1.0, + "step": 2174, + "think_completion_length": 45.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.484375, + "epoch": 3.672849915682968, + "grad_norm": 12.769801192651933, + "kl": 0.50390625, + "learning_rate": 2.6644182124789206e-07, + "loss": 0.0005, + "reward": 3.61961829662323, + "reward_std": 0.2114700749516487, + "rewards/final_reward": 1.8251791289417982, + "rewards/mask_iou_reward": 0.9125895644708991, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6196181774139404, + "rewards/thk_ans_format_reward": 1.0, + "step": 2175, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.203125, + "epoch": 3.6745362563237776, + "grad_norm": 17.69803214425245, + "kl": 0.56640625, + "learning_rate": 2.661045531197302e-07, + "loss": 0.0006, + "reward": 3.3327629566192627, + "reward_std": 0.2733706757426262, + "rewards/final_reward": 1.1473787205170702, + "rewards/mask_iou_reward": 0.5736893602585351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3327630162239075, + "rewards/thk_ans_format_reward": 1.0, + "step": 2176, + "think_completion_length": 40.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.328125, + "epoch": 3.6762225969645868, + "grad_norm": 9.607761168239486, + "kl": 0.55078125, + "learning_rate": 2.657672849915683e-07, + "loss": 0.0006, + "reward": 3.2653400897979736, + "reward_std": 0.3390034884214401, + "rewards/final_reward": 0.8602963584102136, + "rewards/mask_iou_reward": 0.4301481792051068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2653402090072632, + "rewards/thk_ans_format_reward": 1.0, + "step": 2177, + "think_completion_length": 44.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.09375, + "epoch": 3.6779089376053964, + "grad_norm": 7.861388433142454, + "kl": 0.470703125, + "learning_rate": 2.654300168634064e-07, + "loss": 0.0005, + "reward": 3.666589379310608, + "reward_std": 0.0676095001399517, + "rewards/final_reward": 1.5565185785082936, + "rewards/mask_iou_reward": 0.7782592892541468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6665893197059631, + "rewards/thk_ans_format_reward": 1.0, + "step": 2178, + "think_completion_length": 44.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.765625, + "epoch": 3.6795952782462056, + "grad_norm": 6.506833566530017, + "kl": 0.5546875, + "learning_rate": 2.650927487352445e-07, + "loss": 0.0006, + "reward": 3.3174102306365967, + "reward_std": 0.09125644341111183, + "rewards/final_reward": 1.274266532450679, + "rewards/mask_iou_reward": 0.6371332662253395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3174102306365967, + "rewards/thk_ans_format_reward": 1.0, + "step": 2179, + "think_completion_length": 41.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.859375, + "epoch": 3.681281618887015, + "grad_norm": 16.964483085595624, + "kl": 0.56640625, + "learning_rate": 2.6475548060708266e-07, + "loss": 0.0006, + "reward": 3.723142385482788, + "reward_std": 0.08161863312125206, + "rewards/final_reward": 1.7507960958907227, + "rewards/mask_iou_reward": 0.8753980479453614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7231422066688538, + "rewards/thk_ans_format_reward": 1.0, + "step": 2180, + "think_completion_length": 45.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.96875, + "epoch": 3.682967959527825, + "grad_norm": 8.963115232354015, + "kl": 0.5625, + "learning_rate": 2.644182124789207e-07, + "loss": 0.0006, + "reward": 2.962601661682129, + "reward_std": 0.1202041245996952, + "rewards/final_reward": 1.2414144244462615, + "rewards/mask_iou_reward": 0.6207072122231307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9626017510890961, + "rewards/thk_ans_format_reward": 1.0, + "step": 2181, + "think_completion_length": 44.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.953125, + "epoch": 3.684654300168634, + "grad_norm": 7.8568867421636375, + "kl": 0.5703125, + "learning_rate": 2.6408094435075883e-07, + "loss": 0.0006, + "reward": 3.705932855606079, + "reward_std": 0.15980882477015257, + "rewards/final_reward": 1.6566305434563295, + "rewards/mask_iou_reward": 0.8283152717281648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7059327960014343, + "rewards/thk_ans_format_reward": 1.0, + "step": 2182, + "think_completion_length": 38.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.078125, + "epoch": 3.6863406408094432, + "grad_norm": 26.061102769842314, + "kl": 0.55078125, + "learning_rate": 2.63743676222597e-07, + "loss": 0.0005, + "reward": 3.373674750328064, + "reward_std": 0.23157277703285217, + "rewards/final_reward": 1.3468981540470226, + "rewards/mask_iou_reward": 0.6734490770235113, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.373674750328064, + "rewards/thk_ans_format_reward": 1.0, + "step": 2183, + "think_completion_length": 46.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.046875, + "epoch": 3.688026981450253, + "grad_norm": 6.378374390223215, + "kl": 0.56640625, + "learning_rate": 2.6340640809443506e-07, + "loss": 0.0006, + "reward": 3.674358606338501, + "reward_std": 0.19893009960651398, + "rewards/final_reward": 1.5619072067565283, + "rewards/mask_iou_reward": 0.7809536033782641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.674358606338501, + "rewards/thk_ans_format_reward": 1.0, + "step": 2184, + "think_completion_length": 41.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 3.6897133220910625, + "grad_norm": 5.631948058301189, + "kl": 0.5625, + "learning_rate": 2.6306913996627315e-07, + "loss": 0.0006, + "reward": 2.777292251586914, + "reward_std": 0.03504425939172506, + "rewards/final_reward": 0.5667150524643376, + "rewards/mask_iou_reward": 0.2833575262321688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7772921919822693, + "rewards/thk_ans_format_reward": 1.0, + "step": 2185, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.890625, + "epoch": 3.6913996627318717, + "grad_norm": 6.357830145895411, + "kl": 0.560546875, + "learning_rate": 2.627318718381113e-07, + "loss": 0.0006, + "reward": 3.7768133878707886, + "reward_std": 0.148924196138978, + "rewards/final_reward": 1.7280991344863805, + "rewards/mask_iou_reward": 0.8640495672431903, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7768135070800781, + "rewards/thk_ans_format_reward": 1.0, + "step": 2186, + "think_completion_length": 35.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.046875, + "epoch": 3.6930860033726813, + "grad_norm": 17.68862850386764, + "kl": 0.54296875, + "learning_rate": 2.6239460370994943e-07, + "loss": 0.0006, + "reward": 3.8915481567382812, + "reward_std": 0.004744681587908417, + "rewards/final_reward": 1.9137755741846734, + "rewards/mask_iou_reward": 0.9568877870923367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.891547977924347, + "rewards/thk_ans_format_reward": 1.0, + "step": 2187, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.5625, + "epoch": 3.694772344013491, + "grad_norm": 6.4891286437355395, + "kl": 0.537109375, + "learning_rate": 2.620573355817875e-07, + "loss": 0.0005, + "reward": 3.456454873085022, + "reward_std": 0.15797370299696922, + "rewards/final_reward": 1.5738219841745664, + "rewards/mask_iou_reward": 0.7869109920872832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4564548134803772, + "rewards/thk_ans_format_reward": 1.0, + "step": 2188, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.28125, + "epoch": 3.6964586846543, + "grad_norm": 7.514792783715896, + "kl": 0.5205078125, + "learning_rate": 2.617200674536256e-07, + "loss": 0.0005, + "reward": 3.180192708969116, + "reward_std": 0.034278427017852664, + "rewards/final_reward": 0.8698045692364691, + "rewards/mask_iou_reward": 0.43490228461823455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1801927089691162, + "rewards/thk_ans_format_reward": 1.0, + "step": 2189, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.21875, + "epoch": 3.6981450252951094, + "grad_norm": 6.422281694154933, + "kl": 0.56640625, + "learning_rate": 2.6138279932546375e-07, + "loss": 0.0006, + "reward": 3.884060502052307, + "reward_std": 0.0026781876804307103, + "rewards/final_reward": 1.8500053453206569, + "rewards/mask_iou_reward": 0.9250026726603284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8840603828430176, + "rewards/thk_ans_format_reward": 1.0, + "step": 2190, + "think_completion_length": 40.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 3.699831365935919, + "grad_norm": 9.07015849728561, + "kl": 0.59375, + "learning_rate": 2.6104553119730183e-07, + "loss": 0.0006, + "reward": 3.3919734954833984, + "reward_std": 0.049376328475773335, + "rewards/final_reward": 1.2875744224146906, + "rewards/mask_iou_reward": 0.6437872112073453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.391973614692688, + "rewards/thk_ans_format_reward": 1.0, + "step": 2191, + "think_completion_length": 43.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 3.7015177065767286, + "grad_norm": 36.21098255396208, + "kl": 0.57421875, + "learning_rate": 2.6070826306913997e-07, + "loss": 0.0006, + "reward": 3.5568835735321045, + "reward_std": 0.08409557677805424, + "rewards/final_reward": 1.5891457329541123, + "rewards/mask_iou_reward": 0.7945728664770562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.556883454322815, + "rewards/thk_ans_format_reward": 1.0, + "step": 2192, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.671875, + "epoch": 3.703204047217538, + "grad_norm": 6.378184318212264, + "kl": 0.54296875, + "learning_rate": 2.6037099494097806e-07, + "loss": 0.0005, + "reward": 3.353445529937744, + "reward_std": 0.004928447189740837, + "rewards/final_reward": 1.7434356803338902, + "rewards/mask_iou_reward": 0.8717178401669451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3534456193447113, + "rewards/thk_ans_format_reward": 1.0, + "step": 2193, + "think_completion_length": 39.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.046875, + "epoch": 3.7048903878583475, + "grad_norm": 31.98309447729436, + "kl": 0.59765625, + "learning_rate": 2.6003372681281615e-07, + "loss": 0.0006, + "reward": 3.554110288619995, + "reward_std": 0.0832400880753994, + "rewards/final_reward": 1.1983655867542573, + "rewards/mask_iou_reward": 0.5991827933771287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5541102886199951, + "rewards/thk_ans_format_reward": 1.0, + "step": 2194, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.96875, + "epoch": 3.706576728499157, + "grad_norm": 16.719532672014108, + "kl": 0.591796875, + "learning_rate": 2.596964586846543e-07, + "loss": 0.0005, + "reward": 3.358941078186035, + "reward_std": 0.03364470507949591, + "rewards/final_reward": 1.0305207266722358, + "rewards/mask_iou_reward": 0.5152603633361179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.35894113779068, + "rewards/thk_ans_format_reward": 1.0, + "step": 2195, + "think_completion_length": 36.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.15625, + "epoch": 3.7082630691399663, + "grad_norm": 9.148044836164983, + "kl": 0.494140625, + "learning_rate": 2.5935919055649243e-07, + "loss": 0.0005, + "reward": 3.2433149814605713, + "reward_std": 0.3317317571491003, + "rewards/final_reward": 1.6999731855188098, + "rewards/mask_iou_reward": 0.8499865927594049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2433151006698608, + "rewards/thk_ans_format_reward": 1.0, + "step": 2196, + "think_completion_length": 39.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.71875, + "epoch": 3.7099494097807755, + "grad_norm": 11.166676327464593, + "kl": 0.564453125, + "learning_rate": 2.5902192242833046e-07, + "loss": 0.0006, + "reward": 3.205594062805176, + "reward_std": 0.14426006376743317, + "rewards/final_reward": 1.4832082424943116, + "rewards/mask_iou_reward": 0.7416041212471558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.205594152212143, + "rewards/thk_ans_format_reward": 1.0, + "step": 2197, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 3.711635750421585, + "grad_norm": 17.63438941915533, + "kl": 0.619140625, + "learning_rate": 2.586846543001686e-07, + "loss": 0.0006, + "reward": 3.4435391426086426, + "reward_std": 0.04925878718495369, + "rewards/final_reward": 1.4857788899066038, + "rewards/mask_iou_reward": 0.7428894449533019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4435392022132874, + "rewards/thk_ans_format_reward": 1.0, + "step": 2198, + "think_completion_length": 39.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.265625, + "epoch": 3.7133220910623947, + "grad_norm": 22.065547912115726, + "kl": 0.53515625, + "learning_rate": 2.5834738617200674e-07, + "loss": 0.0005, + "reward": 3.2958098649978638, + "reward_std": 0.0769207589328289, + "rewards/final_reward": 1.3118947208677025, + "rewards/mask_iou_reward": 0.6559473604338513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.295809805393219, + "rewards/thk_ans_format_reward": 1.0, + "step": 2199, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.0625, + "epoch": 3.715008431703204, + "grad_norm": 9.77059697873224, + "kl": 0.6171875, + "learning_rate": 2.580101180438449e-07, + "loss": 0.0006, + "reward": 3.5691983699798584, + "reward_std": 0.17080958746373653, + "rewards/final_reward": 1.6634956217435797, + "rewards/mask_iou_reward": 0.8317478108717898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5691982507705688, + "rewards/thk_ans_format_reward": 1.0, + "step": 2200, + "think_completion_length": 40.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.078125, + "epoch": 3.7166947723440136, + "grad_norm": 6.208268504324163, + "kl": 0.5556640625, + "learning_rate": 2.576728499156829e-07, + "loss": 0.0006, + "reward": 3.8266072273254395, + "reward_std": 0.15711436793208122, + "rewards/final_reward": 1.767159090919887, + "rewards/mask_iou_reward": 0.8835795454599435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8266071677207947, + "rewards/thk_ans_format_reward": 1.0, + "step": 2201, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.171875, + "epoch": 3.718381112984823, + "grad_norm": 8.694354037283233, + "kl": 0.662109375, + "learning_rate": 2.5733558178752106e-07, + "loss": 0.0007, + "reward": 3.179361581802368, + "reward_std": 0.030320387333631516, + "rewards/final_reward": 1.3028335589093758, + "rewards/mask_iou_reward": 0.6514167794546879, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.17936173081398, + "rewards/thk_ans_format_reward": 1.0, + "step": 2202, + "think_completion_length": 36.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0625, + "epoch": 3.7200674536256324, + "grad_norm": 12.425460690779902, + "kl": 0.49609375, + "learning_rate": 2.569983136593592e-07, + "loss": 0.0005, + "reward": 3.647360920906067, + "reward_std": 0.06752203544601798, + "rewards/final_reward": 1.86081172540908, + "rewards/mask_iou_reward": 0.93040586270454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6473609805107117, + "rewards/thk_ans_format_reward": 1.0, + "step": 2203, + "think_completion_length": 38.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.796875, + "epoch": 3.7217537942664416, + "grad_norm": 20.256036715233673, + "kl": 0.5859375, + "learning_rate": 2.566610455311973e-07, + "loss": 0.0006, + "reward": 3.146657943725586, + "reward_std": 0.23980345856398344, + "rewards/final_reward": 1.6253581227571177, + "rewards/mask_iou_reward": 0.8126790613785588, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.146657943725586, + "rewards/thk_ans_format_reward": 1.0, + "step": 2204, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.796875, + "epoch": 3.7234401349072512, + "grad_norm": 8.886069594999654, + "kl": 0.541015625, + "learning_rate": 2.5632377740303543e-07, + "loss": 0.0006, + "reward": 3.452314257621765, + "reward_std": 0.07864137506112456, + "rewards/final_reward": 1.2201046790815127, + "rewards/mask_iou_reward": 0.6100523395407563, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4523141980171204, + "rewards/thk_ans_format_reward": 1.0, + "step": 2205, + "think_completion_length": 37.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.046875, + "epoch": 3.725126475548061, + "grad_norm": 11.426640569808292, + "kl": 0.548828125, + "learning_rate": 2.559865092748735e-07, + "loss": 0.0005, + "reward": 3.317999005317688, + "reward_std": 0.25152764841914177, + "rewards/final_reward": 1.0653466622048742, + "rewards/mask_iou_reward": 0.5326733311024371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3179990351200104, + "rewards/thk_ans_format_reward": 1.0, + "step": 2206, + "think_completion_length": 39.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.09375, + "epoch": 3.72681281618887, + "grad_norm": 8.227125714328034, + "kl": 0.490234375, + "learning_rate": 2.556492411467116e-07, + "loss": 0.0005, + "reward": 3.541482925415039, + "reward_std": 0.23475152254104614, + "rewards/final_reward": 1.9007196628561678, + "rewards/mask_iou_reward": 0.9503598314280839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5414828062057495, + "rewards/thk_ans_format_reward": 1.0, + "step": 2207, + "think_completion_length": 38.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.53125, + "epoch": 3.7284991568296797, + "grad_norm": 8.56115780454844, + "kl": 0.595703125, + "learning_rate": 2.5531197301854974e-07, + "loss": 0.0006, + "reward": 2.894770622253418, + "reward_std": 0.37094295769929886, + "rewards/final_reward": 0.772514029249296, + "rewards/mask_iou_reward": 0.386257014624648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8947706520557404, + "rewards/thk_ans_format_reward": 1.0, + "step": 2208, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.625, + "epoch": 3.730185497470489, + "grad_norm": 9.777623691175139, + "kl": 0.537109375, + "learning_rate": 2.549747048903879e-07, + "loss": 0.0005, + "reward": 3.601067543029785, + "reward_std": 0.06892485357820988, + "rewards/final_reward": 1.581348549336064, + "rewards/mask_iou_reward": 0.790674274668032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6010676622390747, + "rewards/thk_ans_format_reward": 1.0, + "step": 2209, + "think_completion_length": 36.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.734375, + "epoch": 3.7318718381112985, + "grad_norm": 12.77242033038978, + "kl": 0.568359375, + "learning_rate": 2.546374367622259e-07, + "loss": 0.0006, + "reward": 3.286626100540161, + "reward_std": 0.1027615237981081, + "rewards/final_reward": 1.4901105977761426, + "rewards/mask_iou_reward": 0.7450552988880713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2866259813308716, + "rewards/thk_ans_format_reward": 1.0, + "step": 2210, + "think_completion_length": 41.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.71875, + "epoch": 3.7335581787521077, + "grad_norm": 12.011241878322691, + "kl": 0.525390625, + "learning_rate": 2.5430016863406406e-07, + "loss": 0.0006, + "reward": 3.164780020713806, + "reward_std": 0.05755174346268177, + "rewards/final_reward": 0.8981543311624601, + "rewards/mask_iou_reward": 0.44907716558123006, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1647798418998718, + "rewards/thk_ans_format_reward": 1.0, + "step": 2211, + "think_completion_length": 39.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.03125, + "epoch": 3.7352445193929174, + "grad_norm": 8.407289252559087, + "kl": 0.576171875, + "learning_rate": 2.539629005059022e-07, + "loss": 0.0006, + "reward": 3.6150245666503906, + "reward_std": 0.27539839781820774, + "rewards/final_reward": 1.5987298278611233, + "rewards/mask_iou_reward": 0.7993649139305616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6150245666503906, + "rewards/thk_ans_format_reward": 1.0, + "step": 2212, + "think_completion_length": 43.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.71875, + "epoch": 3.736930860033727, + "grad_norm": 4.722739695611589, + "kl": 0.556640625, + "learning_rate": 2.5362563237774034e-07, + "loss": 0.0007, + "reward": 3.00023877620697, + "reward_std": 0.08326515275985003, + "rewards/final_reward": 0.3297335486369651, + "rewards/mask_iou_reward": 0.16486677431848254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0002387762069702, + "rewards/thk_ans_format_reward": 1.0, + "step": 2213, + "think_completion_length": 39.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.078125, + "epoch": 3.738617200674536, + "grad_norm": 12.344920387140817, + "kl": 0.6171875, + "learning_rate": 2.5328836424957837e-07, + "loss": 0.0006, + "reward": 2.7376948595046997, + "reward_std": 0.1573820672929287, + "rewards/final_reward": 0.7704514279846209, + "rewards/mask_iou_reward": 0.3852257139923104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7376948893070221, + "rewards/thk_ans_format_reward": 1.0, + "step": 2214, + "think_completion_length": 46.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.953125, + "epoch": 3.740303541315346, + "grad_norm": 8.864111740080398, + "kl": 0.611328125, + "learning_rate": 2.529510961214165e-07, + "loss": 0.0006, + "reward": 3.425600290298462, + "reward_std": 0.09412947855889797, + "rewards/final_reward": 1.4392662550548065, + "rewards/mask_iou_reward": 0.7196331275274033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4256003499031067, + "rewards/thk_ans_format_reward": 1.0, + "step": 2215, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.109375, + "epoch": 3.741989881956155, + "grad_norm": 9.86841034282967, + "kl": 0.640625, + "learning_rate": 2.5261382799325465e-07, + "loss": 0.0006, + "reward": 3.367143392562866, + "reward_std": 0.13670575991272926, + "rewards/final_reward": 1.6991905185967289, + "rewards/mask_iou_reward": 0.8495952592983644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3671433925628662, + "rewards/thk_ans_format_reward": 1.0, + "step": 2216, + "think_completion_length": 39.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.9375, + "epoch": 3.7436762225969646, + "grad_norm": 7.511324151445277, + "kl": 0.6416015625, + "learning_rate": 2.5227655986509274e-07, + "loss": 0.0006, + "reward": 3.638978123664856, + "reward_std": 0.25424132496118546, + "rewards/final_reward": 1.3872458365062224, + "rewards/mask_iou_reward": 0.6936229182531112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6389780044555664, + "rewards/thk_ans_format_reward": 1.0, + "step": 2217, + "think_completion_length": 42.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.984375, + "epoch": 3.745362563237774, + "grad_norm": 8.780601057458544, + "kl": 0.533203125, + "learning_rate": 2.5193929173693083e-07, + "loss": 0.0005, + "reward": 3.2593986988067627, + "reward_std": 0.07271349988877773, + "rewards/final_reward": 1.7570459376830176, + "rewards/mask_iou_reward": 0.8785229688415088, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2593986988067627, + "rewards/thk_ans_format_reward": 1.0, + "step": 2218, + "think_completion_length": 44.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.078125, + "epoch": 3.7470489038785835, + "grad_norm": 7.846688741217049, + "kl": 0.501953125, + "learning_rate": 2.5160202360876897e-07, + "loss": 0.0005, + "reward": 3.4012067317962646, + "reward_std": 0.22127216309309006, + "rewards/final_reward": 1.4312873112016784, + "rewards/mask_iou_reward": 0.7156436556008392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4012067914009094, + "rewards/thk_ans_format_reward": 1.0, + "step": 2219, + "think_completion_length": 36.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.0625, + "epoch": 3.748735244519393, + "grad_norm": 7.298265056389149, + "kl": 0.55078125, + "learning_rate": 2.5126475548060706e-07, + "loss": 0.0006, + "reward": 3.504394292831421, + "reward_std": 0.18095969408750534, + "rewards/final_reward": 1.443156090383158, + "rewards/mask_iou_reward": 0.721578045191579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5043942332267761, + "rewards/thk_ans_format_reward": 1.0, + "step": 2220, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.484375, + "epoch": 3.7504215851602023, + "grad_norm": 6.222948569178547, + "kl": 0.46875, + "learning_rate": 2.509274873524452e-07, + "loss": 0.0005, + "reward": 3.4797459840774536, + "reward_std": 0.07105998322367668, + "rewards/final_reward": 1.6921433070142375, + "rewards/mask_iou_reward": 0.8460716535071188, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.479745864868164, + "rewards/thk_ans_format_reward": 1.0, + "step": 2221, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.125, + "epoch": 3.752107925801012, + "grad_norm": 9.67562564815868, + "kl": 0.4443359375, + "learning_rate": 2.505902192242833e-07, + "loss": 0.0005, + "reward": 3.7335736751556396, + "reward_std": 0.1016400195658207, + "rewards/final_reward": 1.7593110192348576, + "rewards/mask_iou_reward": 0.8796555096174288, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7335737347602844, + "rewards/thk_ans_format_reward": 1.0, + "step": 2222, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.40625, + "epoch": 3.753794266441821, + "grad_norm": 28.102773270256623, + "kl": 0.517578125, + "learning_rate": 2.5025295109612137e-07, + "loss": 0.0005, + "reward": 3.398337244987488, + "reward_std": 0.06284121796488762, + "rewards/final_reward": 1.1384011844333064, + "rewards/mask_iou_reward": 0.5692005922166532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.398337185382843, + "rewards/thk_ans_format_reward": 1.0, + "step": 2223, + "think_completion_length": 41.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.734375, + "epoch": 3.7554806070826308, + "grad_norm": 8.641382424478827, + "kl": 0.5625, + "learning_rate": 2.499156829679595e-07, + "loss": 0.0006, + "reward": 3.4698305130004883, + "reward_std": 0.09916340420022607, + "rewards/final_reward": 1.4339345953372138, + "rewards/mask_iou_reward": 0.7169672976686069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4698305130004883, + "rewards/thk_ans_format_reward": 1.0, + "step": 2224, + "think_completion_length": 36.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.28125, + "epoch": 3.75716694772344, + "grad_norm": 11.69530197812389, + "kl": 0.490234375, + "learning_rate": 2.4957841483979765e-07, + "loss": 0.0005, + "reward": 3.0470253229141235, + "reward_std": 0.13745611906051636, + "rewards/final_reward": 1.1631897760026755, + "rewards/mask_iou_reward": 0.5815948880013377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0470252931118011, + "rewards/thk_ans_format_reward": 1.0, + "step": 2225, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.09375, + "epoch": 3.7588532883642496, + "grad_norm": 7.5304097896928415, + "kl": 0.66796875, + "learning_rate": 2.4924114671163574e-07, + "loss": 0.0007, + "reward": 3.7693170309066772, + "reward_std": 0.23819169402122498, + "rewards/final_reward": 1.684009822965522, + "rewards/mask_iou_reward": 0.842004911482761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7693171501159668, + "rewards/thk_ans_format_reward": 1.0, + "step": 2226, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.84375, + "epoch": 3.7605396290050592, + "grad_norm": 22.00318643891571, + "kl": 0.484375, + "learning_rate": 2.489038785834739e-07, + "loss": 0.0004, + "reward": 3.6224844455718994, + "reward_std": 0.16775443218648434, + "rewards/final_reward": 1.5701185300553635, + "rewards/mask_iou_reward": 0.7850592650276818, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6224846243858337, + "rewards/thk_ans_format_reward": 1.0, + "step": 2227, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.21875, + "epoch": 3.7622259696458684, + "grad_norm": 7.513678492251124, + "kl": 0.59375, + "learning_rate": 2.4856661045531197e-07, + "loss": 0.0006, + "reward": 3.82713782787323, + "reward_std": 0.016284896060824394, + "rewards/final_reward": 1.7292593459223653, + "rewards/mask_iou_reward": 0.8646296729611826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8271378874778748, + "rewards/thk_ans_format_reward": 1.0, + "step": 2228, + "think_completion_length": 38.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.671875, + "epoch": 3.763912310286678, + "grad_norm": 9.628506712225574, + "kl": 0.54296875, + "learning_rate": 2.4822934232715005e-07, + "loss": 0.0005, + "reward": 3.134037733078003, + "reward_std": 0.08405065536499023, + "rewards/final_reward": 1.1449944024136807, + "rewards/mask_iou_reward": 0.5724972012068403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1340378522872925, + "rewards/thk_ans_format_reward": 1.0, + "step": 2229, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 3.7655986509274872, + "grad_norm": 6.483859533323332, + "kl": 0.544921875, + "learning_rate": 2.478920741989882e-07, + "loss": 0.0005, + "reward": 2.9359426498413086, + "reward_std": 0.03952119592577219, + "rewards/final_reward": 1.0045149243772502, + "rewards/mask_iou_reward": 0.5022574621886251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.935942679643631, + "rewards/thk_ans_format_reward": 1.0, + "step": 2230, + "think_completion_length": 47.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0, + "epoch": 3.767284991568297, + "grad_norm": 6.321503986653633, + "kl": 0.53125, + "learning_rate": 2.475548060708263e-07, + "loss": 0.0005, + "reward": 3.5230026245117188, + "reward_std": 0.07344697206281126, + "rewards/final_reward": 1.4060531780160597, + "rewards/mask_iou_reward": 0.7030265890080298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.523002803325653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2231, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.390625, + "epoch": 3.768971332209106, + "grad_norm": 11.485002134115911, + "kl": 0.5146484375, + "learning_rate": 2.4721753794266437e-07, + "loss": 0.0005, + "reward": 3.668249011039734, + "reward_std": 0.028451272868551314, + "rewards/final_reward": 1.675905328253724, + "rewards/mask_iou_reward": 0.837952664126862, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6682490706443787, + "rewards/thk_ans_format_reward": 1.0, + "step": 2232, + "think_completion_length": 39.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.515625, + "epoch": 3.7706576728499157, + "grad_norm": 7.764965076833885, + "kl": 0.556640625, + "learning_rate": 2.468802698145025e-07, + "loss": 0.0006, + "reward": 3.082284688949585, + "reward_std": 0.13143670186400414, + "rewards/final_reward": 0.6998492752706811, + "rewards/mask_iou_reward": 0.34992463763534054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0822848081588745, + "rewards/thk_ans_format_reward": 1.0, + "step": 2233, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.1875, + "epoch": 3.7723440134907253, + "grad_norm": 6.845152442654978, + "kl": 0.48828125, + "learning_rate": 2.4654300168634065e-07, + "loss": 0.0005, + "reward": 3.307486414909363, + "reward_std": 0.07110036723315716, + "rewards/final_reward": 1.8586305476138003, + "rewards/mask_iou_reward": 0.9293152738069002, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3074862957000732, + "rewards/thk_ans_format_reward": 1.0, + "step": 2234, + "think_completion_length": 39.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.65625, + "epoch": 3.7740303541315345, + "grad_norm": 21.161976840363057, + "kl": 0.546875, + "learning_rate": 2.4620573355817874e-07, + "loss": 0.0005, + "reward": 3.489209532737732, + "reward_std": 0.1913529559969902, + "rewards/final_reward": 1.392710809261045, + "rewards/mask_iou_reward": 0.6963554046305225, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4892095923423767, + "rewards/thk_ans_format_reward": 1.0, + "step": 2235, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.1875, + "epoch": 3.775716694772344, + "grad_norm": 41.94328118116562, + "kl": 0.568359375, + "learning_rate": 2.458684654300169e-07, + "loss": 0.0006, + "reward": 3.7272164821624756, + "reward_std": 0.10080359177663922, + "rewards/final_reward": 1.5988337732937237, + "rewards/mask_iou_reward": 0.7994168866468618, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7272164821624756, + "rewards/thk_ans_format_reward": 1.0, + "step": 2236, + "think_completion_length": 39.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.703125, + "epoch": 3.7774030354131534, + "grad_norm": 12.693738709933934, + "kl": 0.5234375, + "learning_rate": 2.4553119730185496e-07, + "loss": 0.0005, + "reward": 3.2859922647476196, + "reward_std": 0.3157341778278351, + "rewards/final_reward": 1.5607425408666726, + "rewards/mask_iou_reward": 0.7803712704333363, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3016172647476196, + "rewards/thk_ans_format_reward": 1.0, + "step": 2237, + "think_completion_length": 38.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.984375, + "epoch": 3.779089376053963, + "grad_norm": 21.18898534386323, + "kl": 0.55859375, + "learning_rate": 2.451939291736931e-07, + "loss": 0.0006, + "reward": 3.6144726276397705, + "reward_std": 0.05499284155666828, + "rewards/final_reward": 1.5110642429760337, + "rewards/mask_iou_reward": 0.7555321214880169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6144728064537048, + "rewards/thk_ans_format_reward": 1.0, + "step": 2238, + "think_completion_length": 36.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.296875, + "epoch": 3.780775716694772, + "grad_norm": 11.254064369218629, + "kl": 0.576171875, + "learning_rate": 2.448566610455312e-07, + "loss": 0.0006, + "reward": 3.332844614982605, + "reward_std": 0.22398744896054268, + "rewards/final_reward": 1.2413680375181946, + "rewards/mask_iou_reward": 0.6206840187590973, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.332844614982605, + "rewards/thk_ans_format_reward": 1.0, + "step": 2239, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.953125, + "epoch": 3.782462057335582, + "grad_norm": 12.948537225722228, + "kl": 0.576171875, + "learning_rate": 2.4451939291736933e-07, + "loss": 0.0006, + "reward": 3.6998353004455566, + "reward_std": 0.02471212111413479, + "rewards/final_reward": 1.7049763848571122, + "rewards/mask_iou_reward": 0.8524881924285561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6998353004455566, + "rewards/thk_ans_format_reward": 1.0, + "step": 2240, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.390625, + "epoch": 3.7841483979763915, + "grad_norm": 10.909507483133451, + "kl": 0.4892578125, + "learning_rate": 2.441821247892074e-07, + "loss": 0.0005, + "reward": 3.4952789545059204, + "reward_std": 0.03494591638445854, + "rewards/final_reward": 1.1304556359104303, + "rewards/mask_iou_reward": 0.5652278179552152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.49527907371521, + "rewards/thk_ans_format_reward": 1.0, + "step": 2241, + "think_completion_length": 49.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.375, + "epoch": 3.7858347386172007, + "grad_norm": 6.362501564088327, + "kl": 0.4287109375, + "learning_rate": 2.438448566610455e-07, + "loss": 0.0004, + "reward": 3.486288070678711, + "reward_std": 0.2311484133824706, + "rewards/final_reward": 1.6722780335729297, + "rewards/mask_iou_reward": 0.8361390167864649, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4862881898880005, + "rewards/thk_ans_format_reward": 1.0, + "step": 2242, + "think_completion_length": 42.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.15625, + "epoch": 3.78752107925801, + "grad_norm": 23.489230128982506, + "kl": 0.650390625, + "learning_rate": 2.4350758853288365e-07, + "loss": 0.0006, + "reward": 3.7910631895065308, + "reward_std": 0.031536445720121264, + "rewards/final_reward": 1.8889532907636273, + "rewards/mask_iou_reward": 0.9444766453818136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7910633087158203, + "rewards/thk_ans_format_reward": 1.0, + "step": 2243, + "think_completion_length": 35.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.546875, + "epoch": 3.7892074198988195, + "grad_norm": 8.982080291054826, + "kl": 0.548828125, + "learning_rate": 2.4317032040472173e-07, + "loss": 0.0006, + "reward": 3.8048187494277954, + "reward_std": 0.06128034554421902, + "rewards/final_reward": 1.9306495480027581, + "rewards/mask_iou_reward": 0.9653247740013791, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.804818868637085, + "rewards/thk_ans_format_reward": 1.0, + "step": 2244, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25, + "epoch": 3.790893760539629, + "grad_norm": 9.209614953926765, + "kl": 0.525390625, + "learning_rate": 2.428330522765598e-07, + "loss": 0.0005, + "reward": 3.5684866905212402, + "reward_std": 0.06634041853249073, + "rewards/final_reward": 1.7982114449418654, + "rewards/mask_iou_reward": 0.8991057224709327, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5684868097305298, + "rewards/thk_ans_format_reward": 1.0, + "step": 2245, + "think_completion_length": 39.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.703125, + "epoch": 3.7925801011804383, + "grad_norm": 7.330973313643778, + "kl": 0.587890625, + "learning_rate": 2.4249578414839796e-07, + "loss": 0.0006, + "reward": 3.0241518020629883, + "reward_std": 0.0621052160859108, + "rewards/final_reward": 0.7467765296325852, + "rewards/mask_iou_reward": 0.3733882648162926, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.024151861667633, + "rewards/thk_ans_format_reward": 1.0, + "step": 2246, + "think_completion_length": 44.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.890625, + "epoch": 3.794266441821248, + "grad_norm": 6.687208782228291, + "kl": 0.564453125, + "learning_rate": 2.4215851602023605e-07, + "loss": 0.0005, + "reward": 3.8984771966934204, + "reward_std": 0.0070637313183397055, + "rewards/final_reward": 1.9246973970811951, + "rewards/mask_iou_reward": 0.9623486985405976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8984771966934204, + "rewards/thk_ans_format_reward": 1.0, + "step": 2247, + "think_completion_length": 38.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.359375, + "epoch": 3.7959527824620576, + "grad_norm": 9.61095483768434, + "kl": 0.603515625, + "learning_rate": 2.418212478920742e-07, + "loss": 0.0006, + "reward": 3.569265365600586, + "reward_std": 0.0774321025237441, + "rewards/final_reward": 1.8779691392941031, + "rewards/mask_iou_reward": 0.9389845696470516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5692653059959412, + "rewards/thk_ans_format_reward": 1.0, + "step": 2248, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.390625, + "epoch": 3.7976391231028668, + "grad_norm": 9.22119229655192, + "kl": 0.53515625, + "learning_rate": 2.414839797639123e-07, + "loss": 0.0005, + "reward": 3.323352098464966, + "reward_std": 0.22597427666187286, + "rewards/final_reward": 1.6997667859197665, + "rewards/mask_iou_reward": 0.8498833929598832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3233520984649658, + "rewards/thk_ans_format_reward": 1.0, + "step": 2249, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.4375, + "epoch": 3.799325463743676, + "grad_norm": 14.710195243504126, + "kl": 0.52734375, + "learning_rate": 2.411467116357504e-07, + "loss": 0.0005, + "reward": 3.4668972492218018, + "reward_std": 0.14925647154450417, + "rewards/final_reward": 1.6721082015758792, + "rewards/mask_iou_reward": 0.8360541007879396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4668973684310913, + "rewards/thk_ans_format_reward": 1.0, + "step": 2250, + "think_completion_length": 41.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.21875, + "epoch": 3.8010118043844856, + "grad_norm": 8.207460656492335, + "kl": 0.552734375, + "learning_rate": 2.408094435075885e-07, + "loss": 0.0006, + "reward": 3.2669492959976196, + "reward_std": 0.20546810171799734, + "rewards/final_reward": 1.0126203292625804, + "rewards/mask_iou_reward": 0.5063101646312902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2669492959976196, + "rewards/thk_ans_format_reward": 1.0, + "step": 2251, + "think_completion_length": 38.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8125, + "epoch": 3.8026981450252952, + "grad_norm": 12.870806106607096, + "kl": 3.0625, + "learning_rate": 2.4047217537942665e-07, + "loss": 0.003, + "reward": 3.3490335941314697, + "reward_std": 0.19104180857539177, + "rewards/final_reward": 1.4805655172421521, + "rewards/mask_iou_reward": 0.7402827586210761, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.3646584749221802, + "rewards/thk_ans_format_reward": 1.0, + "step": 2252, + "think_completion_length": 40.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 3.8043844856661044, + "grad_norm": 7.855936861729753, + "kl": 0.6640625, + "learning_rate": 2.4013490725126473e-07, + "loss": 0.0007, + "reward": 3.355563998222351, + "reward_std": 0.1781761646270752, + "rewards/final_reward": 1.064294652552745, + "rewards/mask_iou_reward": 0.5321473262763725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3555639386177063, + "rewards/thk_ans_format_reward": 1.0, + "step": 2253, + "think_completion_length": 45.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.078125, + "epoch": 3.806070826306914, + "grad_norm": 7.972201657350866, + "kl": 0.56640625, + "learning_rate": 2.3979763912310287e-07, + "loss": 0.0006, + "reward": 3.093757748603821, + "reward_std": 0.11396299209445715, + "rewards/final_reward": 0.8799707201905502, + "rewards/mask_iou_reward": 0.4399853600952751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0937578082084656, + "rewards/thk_ans_format_reward": 1.0, + "step": 2254, + "think_completion_length": 41.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.46875, + "epoch": 3.8077571669477237, + "grad_norm": 14.963502770880439, + "kl": 0.640625, + "learning_rate": 2.3946037099494096e-07, + "loss": 0.0006, + "reward": 3.5387972593307495, + "reward_std": 0.12345702201128006, + "rewards/final_reward": 1.6928634493897015, + "rewards/mask_iou_reward": 0.8464317246948507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.538797378540039, + "rewards/thk_ans_format_reward": 1.0, + "step": 2255, + "think_completion_length": 37.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.71875, + "epoch": 3.809443507588533, + "grad_norm": 6.000778542491872, + "kl": 0.62109375, + "learning_rate": 2.391231028667791e-07, + "loss": 0.0006, + "reward": 3.505826711654663, + "reward_std": 0.14541307091712952, + "rewards/final_reward": 1.323766110955759, + "rewards/mask_iou_reward": 0.6618830554778795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5058266520500183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2256, + "think_completion_length": 37.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 3.811129848229342, + "grad_norm": 7.77969270674984, + "kl": 0.47265625, + "learning_rate": 2.387858347386172e-07, + "loss": 0.0005, + "reward": 3.2499868869781494, + "reward_std": 0.046357049606740475, + "rewards/final_reward": 1.153350762684806, + "rewards/mask_iou_reward": 0.576675381342403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.249986857175827, + "rewards/thk_ans_format_reward": 1.0, + "step": 2257, + "think_completion_length": 46.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.171875, + "epoch": 3.8128161888701517, + "grad_norm": 5.7795329042659525, + "kl": 0.537109375, + "learning_rate": 2.384485666104553e-07, + "loss": 0.0005, + "reward": 3.6921584606170654, + "reward_std": 0.0910279038362205, + "rewards/final_reward": 1.607268500317296, + "rewards/mask_iou_reward": 0.803634250158648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6921584606170654, + "rewards/thk_ans_format_reward": 1.0, + "step": 2258, + "think_completion_length": 46.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.0, + "epoch": 3.8145025295109614, + "grad_norm": 44.85207032598342, + "kl": 0.4765625, + "learning_rate": 2.3811129848229342e-07, + "loss": 0.0005, + "reward": 3.449580430984497, + "reward_std": 0.15453584492206573, + "rewards/final_reward": 1.455731098691922, + "rewards/mask_iou_reward": 0.727865549345961, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4495803713798523, + "rewards/thk_ans_format_reward": 1.0, + "step": 2259, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.578125, + "epoch": 3.8161888701517706, + "grad_norm": 8.150016547919764, + "kl": 0.5, + "learning_rate": 2.377740303541315e-07, + "loss": 0.0005, + "reward": 3.3544111251831055, + "reward_std": 0.04027549549937248, + "rewards/final_reward": 1.8442765117650843, + "rewards/mask_iou_reward": 0.9221382558825422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3544110655784607, + "rewards/thk_ans_format_reward": 1.0, + "step": 2260, + "think_completion_length": 40.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.1875, + "epoch": 3.81787521079258, + "grad_norm": 22.799975985239875, + "kl": 0.5546875, + "learning_rate": 2.3743676222596964e-07, + "loss": 0.0006, + "reward": 3.2492820024490356, + "reward_std": 0.0645350944250822, + "rewards/final_reward": 0.7860365323414302, + "rewards/mask_iou_reward": 0.3930182661707151, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2492821216583252, + "rewards/thk_ans_format_reward": 1.0, + "step": 2261, + "think_completion_length": 40.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.125, + "epoch": 3.8195615514333894, + "grad_norm": 9.297461895727002, + "kl": 0.6875, + "learning_rate": 2.3709949409780776e-07, + "loss": 0.0007, + "reward": 3.1288259029388428, + "reward_std": 0.10558873787522316, + "rewards/final_reward": 0.5146808146562, + "rewards/mask_iou_reward": 0.2573404073281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1288259029388428, + "rewards/thk_ans_format_reward": 1.0, + "step": 2262, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.140625, + "epoch": 3.821247892074199, + "grad_norm": 6.6976524641599875, + "kl": 0.564453125, + "learning_rate": 2.3676222596964585e-07, + "loss": 0.0005, + "reward": 3.7508952617645264, + "reward_std": 0.07979346811771393, + "rewards/final_reward": 1.9381548986924892, + "rewards/mask_iou_reward": 0.9690774493462446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7508953213691711, + "rewards/thk_ans_format_reward": 1.0, + "step": 2263, + "think_completion_length": 37.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.53125, + "epoch": 3.822934232715008, + "grad_norm": 6.993854221440401, + "kl": 0.55078125, + "learning_rate": 2.3642495784148399e-07, + "loss": 0.0005, + "reward": 3.3509509563446045, + "reward_std": 0.08846403658390045, + "rewards/final_reward": 0.9924140661206481, + "rewards/mask_iou_reward": 0.49620703306032404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3509510159492493, + "rewards/thk_ans_format_reward": 1.0, + "step": 2264, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.0625, + "epoch": 3.824620573355818, + "grad_norm": 7.1760436728186106, + "kl": 0.61328125, + "learning_rate": 2.3608768971332207e-07, + "loss": 0.0006, + "reward": 3.4479448795318604, + "reward_std": 0.022803470492362976, + "rewards/final_reward": 1.0529762239628047, + "rewards/mask_iou_reward": 0.5264881119814023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4479448199272156, + "rewards/thk_ans_format_reward": 1.0, + "step": 2265, + "think_completion_length": 44.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.96875, + "epoch": 3.8263069139966275, + "grad_norm": 15.65109177458448, + "kl": 0.572265625, + "learning_rate": 2.357504215851602e-07, + "loss": 0.0006, + "reward": 3.2824543714523315, + "reward_std": 0.06921002082526684, + "rewards/final_reward": 1.439779531366369, + "rewards/mask_iou_reward": 0.7198897656831845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2824545502662659, + "rewards/thk_ans_format_reward": 1.0, + "step": 2266, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.453125, + "epoch": 3.8279932546374367, + "grad_norm": 16.79945725064245, + "kl": 0.568359375, + "learning_rate": 2.354131534569983e-07, + "loss": 0.0006, + "reward": 3.217270016670227, + "reward_std": 0.03682664316147566, + "rewards/final_reward": 1.5846480653921238, + "rewards/mask_iou_reward": 0.7923240326960619, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.217270016670227, + "rewards/thk_ans_format_reward": 1.0, + "step": 2267, + "think_completion_length": 38.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 3.8296795952782463, + "grad_norm": 8.20014333565937, + "kl": 0.587890625, + "learning_rate": 2.3507588532883641e-07, + "loss": 0.0006, + "reward": 3.5661516189575195, + "reward_std": 0.022400468587875366, + "rewards/final_reward": 1.853252251249863, + "rewards/mask_iou_reward": 0.9266261256249315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.56615149974823, + "rewards/thk_ans_format_reward": 1.0, + "step": 2268, + "think_completion_length": 37.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.015625, + "epoch": 3.8313659359190555, + "grad_norm": 30.934744592352697, + "kl": 0.5859375, + "learning_rate": 2.3473861720067453e-07, + "loss": 0.0006, + "reward": 3.341153144836426, + "reward_std": 0.27450861036777496, + "rewards/final_reward": 1.215352754567551, + "rewards/mask_iou_reward": 0.6076763772837755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3411532640457153, + "rewards/thk_ans_format_reward": 1.0, + "step": 2269, + "think_completion_length": 40.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.171875, + "epoch": 3.833052276559865, + "grad_norm": 12.834656985151206, + "kl": 0.48828125, + "learning_rate": 2.3440134907251264e-07, + "loss": 0.0005, + "reward": 3.2810736894607544, + "reward_std": 0.18031561793759465, + "rewards/final_reward": 0.9614180023944365, + "rewards/mask_iou_reward": 0.48070900119721827, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2810736298561096, + "rewards/thk_ans_format_reward": 1.0, + "step": 2270, + "think_completion_length": 42.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.140625, + "epoch": 3.8347386172006743, + "grad_norm": 14.12803405633131, + "kl": 0.55078125, + "learning_rate": 2.3406408094435076e-07, + "loss": 0.0006, + "reward": 3.6309303045272827, + "reward_std": 0.07168065011501312, + "rewards/final_reward": 1.728945760123278, + "rewards/mask_iou_reward": 0.864472880061639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6309301853179932, + "rewards/thk_ans_format_reward": 1.0, + "step": 2271, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.921875, + "epoch": 3.836424957841484, + "grad_norm": 5.372312685751642, + "kl": 0.5244140625, + "learning_rate": 2.3372681281618887e-07, + "loss": 0.0005, + "reward": 3.1526389122009277, + "reward_std": 0.2288635354489088, + "rewards/final_reward": 1.4706292807385708, + "rewards/mask_iou_reward": 0.7353146403692854, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.1995137929916382, + "rewards/thk_ans_format_reward": 1.0, + "step": 2272, + "think_completion_length": 38.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.765625, + "epoch": 3.8381112984822936, + "grad_norm": 97.88424337284546, + "kl": 0.525390625, + "learning_rate": 2.3338954468802696e-07, + "loss": 0.0005, + "reward": 3.5338401794433594, + "reward_std": 0.19119788333773613, + "rewards/final_reward": 1.326255627713599, + "rewards/mask_iou_reward": 0.6631278138567995, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5338401794433594, + "rewards/thk_ans_format_reward": 1.0, + "step": 2273, + "think_completion_length": 43.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.765625, + "epoch": 3.839797639123103, + "grad_norm": 12.919983043098979, + "kl": 0.55078125, + "learning_rate": 2.330522765598651e-07, + "loss": 0.0005, + "reward": 3.437851667404175, + "reward_std": 0.02033051522448659, + "rewards/final_reward": 1.091737954300925, + "rewards/mask_iou_reward": 0.5458689771504625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4378515779972076, + "rewards/thk_ans_format_reward": 1.0, + "step": 2274, + "think_completion_length": 42.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.890625, + "epoch": 3.8414839797639124, + "grad_norm": 112.94144962199447, + "kl": 0.521484375, + "learning_rate": 2.3271500843170318e-07, + "loss": 0.0005, + "reward": 3.4854648113250732, + "reward_std": 0.0527753047645092, + "rewards/final_reward": 1.4952481048274144, + "rewards/mask_iou_reward": 0.7476240524137072, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4854648113250732, + "rewards/thk_ans_format_reward": 1.0, + "step": 2275, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.140625, + "epoch": 3.8431703204047216, + "grad_norm": 14.33435507420024, + "kl": 0.568359375, + "learning_rate": 2.323777403035413e-07, + "loss": 0.0006, + "reward": 3.5199133157730103, + "reward_std": 0.04245698405429721, + "rewards/final_reward": 1.245931844644452, + "rewards/mask_iou_reward": 0.622965922322226, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5199132561683655, + "rewards/thk_ans_format_reward": 1.0, + "step": 2276, + "think_completion_length": 44.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.203125, + "epoch": 3.8448566610455313, + "grad_norm": 5.219698782691755, + "kl": 0.4921875, + "learning_rate": 2.320404721753794e-07, + "loss": 0.0005, + "reward": 3.1729589700698853, + "reward_std": 0.08763368986546993, + "rewards/final_reward": 1.6196611753146797, + "rewards/mask_iou_reward": 0.8098305876573398, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1729588210582733, + "rewards/thk_ans_format_reward": 1.0, + "step": 2277, + "think_completion_length": 47.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.734375, + "epoch": 3.8465430016863404, + "grad_norm": 8.98229960876335, + "kl": 0.5234375, + "learning_rate": 2.3170320404721753e-07, + "loss": 0.0005, + "reward": 3.2349685430526733, + "reward_std": 0.035814208909869194, + "rewards/final_reward": 0.9417275861362087, + "rewards/mask_iou_reward": 0.47086379306810433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2349684834480286, + "rewards/thk_ans_format_reward": 1.0, + "step": 2278, + "think_completion_length": 47.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.65625, + "epoch": 3.84822934232715, + "grad_norm": 13.066641019019265, + "kl": 0.5, + "learning_rate": 2.3136593591905564e-07, + "loss": 0.0005, + "reward": 3.577161431312561, + "reward_std": 0.1769073959439993, + "rewards/final_reward": 1.7993820008262598, + "rewards/mask_iou_reward": 0.8996910004131299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5771613717079163, + "rewards/thk_ans_format_reward": 1.0, + "step": 2279, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.46875, + "epoch": 3.8499156829679597, + "grad_norm": 16.36272162499701, + "kl": 0.54296875, + "learning_rate": 2.3102866779089375e-07, + "loss": 0.0005, + "reward": 3.342397093772888, + "reward_std": 0.05663332901895046, + "rewards/final_reward": 1.3201730435836965, + "rewards/mask_iou_reward": 0.6600865217918482, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.342397153377533, + "rewards/thk_ans_format_reward": 1.0, + "step": 2280, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.953125, + "epoch": 3.851602023608769, + "grad_norm": 21.31976206344704, + "kl": 0.564453125, + "learning_rate": 2.3069139966273184e-07, + "loss": 0.0006, + "reward": 3.7095367908477783, + "reward_std": 0.09649944491684437, + "rewards/final_reward": 1.7138956461854962, + "rewards/mask_iou_reward": 0.8569478230927481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7095369100570679, + "rewards/thk_ans_format_reward": 1.0, + "step": 2281, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.25, + "epoch": 3.8532883642495785, + "grad_norm": 26.774222606813684, + "kl": 0.5556640625, + "learning_rate": 2.3035413153456998e-07, + "loss": 0.0006, + "reward": 3.27087664604187, + "reward_std": 0.27236051857471466, + "rewards/final_reward": 1.299968729081332, + "rewards/mask_iou_reward": 0.649984364540666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2708768248558044, + "rewards/thk_ans_format_reward": 1.0, + "step": 2282, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.6875, + "epoch": 3.8549747048903877, + "grad_norm": 11.647422897155579, + "kl": 0.53515625, + "learning_rate": 2.3001686340640807e-07, + "loss": 0.0006, + "reward": 3.6066473722457886, + "reward_std": 0.037033793749287724, + "rewards/final_reward": 1.7790633561622415, + "rewards/mask_iou_reward": 0.8895316780811208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6066473126411438, + "rewards/thk_ans_format_reward": 1.0, + "step": 2283, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.28125, + "epoch": 3.8566610455311974, + "grad_norm": 6.399696029593912, + "kl": 0.59765625, + "learning_rate": 2.296795952782462e-07, + "loss": 0.0006, + "reward": 3.1298261880874634, + "reward_std": 0.135479424148798, + "rewards/final_reward": 1.0958613893826303, + "rewards/mask_iou_reward": 0.5479306946913152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1298261880874634, + "rewards/thk_ans_format_reward": 1.0, + "step": 2284, + "think_completion_length": 39.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.890625, + "epoch": 3.8583473861720066, + "grad_norm": 12.53030051498087, + "kl": 0.470703125, + "learning_rate": 2.293423271500843e-07, + "loss": 0.0005, + "reward": 3.647843360900879, + "reward_std": 0.18895704671740532, + "rewards/final_reward": 1.4987438065983483, + "rewards/mask_iou_reward": 0.7493719032991741, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6478431224822998, + "rewards/thk_ans_format_reward": 1.0, + "step": 2285, + "think_completion_length": 43.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.859375, + "epoch": 3.860033726812816, + "grad_norm": 8.084188059338498, + "kl": 0.56640625, + "learning_rate": 2.290050590219224e-07, + "loss": 0.0006, + "reward": 3.7369555234909058, + "reward_std": 0.12819246295839548, + "rewards/final_reward": 1.6847100480336998, + "rewards/mask_iou_reward": 0.8423550240168499, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7369555830955505, + "rewards/thk_ans_format_reward": 1.0, + "step": 2286, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.9375, + "epoch": 3.861720067453626, + "grad_norm": 13.772257072069493, + "kl": 0.4736328125, + "learning_rate": 2.2866779089376052e-07, + "loss": 0.0005, + "reward": 3.70966899394989, + "reward_std": 0.07207040954381227, + "rewards/final_reward": 1.501267929796344, + "rewards/mask_iou_reward": 0.750633964898172, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.709669053554535, + "rewards/thk_ans_format_reward": 1.0, + "step": 2287, + "think_completion_length": 44.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.515625, + "epoch": 3.863406408094435, + "grad_norm": 16.33337265919695, + "kl": 0.537109375, + "learning_rate": 2.2833052276559864e-07, + "loss": 0.0005, + "reward": 3.5507311820983887, + "reward_std": 0.11423347145318985, + "rewards/final_reward": 1.7487370427616498, + "rewards/mask_iou_reward": 0.8743685213808249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5507311820983887, + "rewards/thk_ans_format_reward": 1.0, + "step": 2288, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.328125, + "epoch": 3.8650927487352447, + "grad_norm": 7.145956043564162, + "kl": 0.541015625, + "learning_rate": 2.2799325463743673e-07, + "loss": 0.0005, + "reward": 2.844885468482971, + "reward_std": 0.35542061924934387, + "rewards/final_reward": 0.911483163947697, + "rewards/mask_iou_reward": 0.4557415819738485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8448854386806488, + "rewards/thk_ans_format_reward": 1.0, + "step": 2289, + "think_completion_length": 45.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.953125, + "epoch": 3.866779089376054, + "grad_norm": 53.96931329998067, + "kl": 0.546875, + "learning_rate": 2.2765598650927487e-07, + "loss": 0.0005, + "reward": 3.043015956878662, + "reward_std": 0.4434027671813965, + "rewards/final_reward": 0.5968193481833238, + "rewards/mask_iou_reward": 0.2984096740916619, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.136765956878662, + "rewards/thk_ans_format_reward": 0.953125, + "step": 2290, + "think_completion_length": 43.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.671875, + "epoch": 3.8684654300168635, + "grad_norm": 10.023266830041628, + "kl": 0.63671875, + "learning_rate": 2.2731871838111298e-07, + "loss": 0.0006, + "reward": 2.770586848258972, + "reward_std": 0.19898640364408493, + "rewards/final_reward": 1.2640052339226235, + "rewards/mask_iou_reward": 0.6320026169613118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7705867886543274, + "rewards/thk_ans_format_reward": 1.0, + "step": 2291, + "think_completion_length": 35.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.84375, + "epoch": 3.8701517706576727, + "grad_norm": 15.042473029968052, + "kl": 0.57421875, + "learning_rate": 2.269814502529511e-07, + "loss": 0.0006, + "reward": 3.5436513423919678, + "reward_std": 0.007338247261941433, + "rewards/final_reward": 1.1669293771860252, + "rewards/mask_iou_reward": 0.5834646885930126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5436512231826782, + "rewards/thk_ans_format_reward": 1.0, + "step": 2292, + "think_completion_length": 42.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.234375, + "epoch": 3.8718381112984823, + "grad_norm": 4.985518385493025, + "kl": 0.4462890625, + "learning_rate": 2.266441821247892e-07, + "loss": 0.0004, + "reward": 3.6346672773361206, + "reward_std": 0.05469698668457568, + "rewards/final_reward": 1.8549077323712804, + "rewards/mask_iou_reward": 0.9274538661856402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.634667456150055, + "rewards/thk_ans_format_reward": 1.0, + "step": 2293, + "think_completion_length": 39.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.34375, + "epoch": 3.873524451939292, + "grad_norm": 8.984147115120505, + "kl": 0.533203125, + "learning_rate": 2.263069139966273e-07, + "loss": 0.0005, + "reward": 3.6444458961486816, + "reward_std": 0.11999626411125064, + "rewards/final_reward": 1.694625798370633, + "rewards/mask_iou_reward": 0.8473128991853165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6444460153579712, + "rewards/thk_ans_format_reward": 1.0, + "step": 2294, + "think_completion_length": 40.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.875, + "epoch": 3.875210792580101, + "grad_norm": 10.732266711491457, + "kl": 0.53125, + "learning_rate": 2.2596964586846544e-07, + "loss": 0.0005, + "reward": 3.298237681388855, + "reward_std": 0.07919766753911972, + "rewards/final_reward": 1.287060814014629, + "rewards/mask_iou_reward": 0.6435304070073145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2982377409934998, + "rewards/thk_ans_format_reward": 1.0, + "step": 2295, + "think_completion_length": 41.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.421875, + "epoch": 3.876897133220911, + "grad_norm": 17.229923944564966, + "kl": 0.552734375, + "learning_rate": 2.2563237774030352e-07, + "loss": 0.0006, + "reward": 3.177481770515442, + "reward_std": 0.33257442712783813, + "rewards/final_reward": 1.1130151611958692, + "rewards/mask_iou_reward": 0.5565075805979346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1774817407131195, + "rewards/thk_ans_format_reward": 1.0, + "step": 2296, + "think_completion_length": 37.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.515625, + "epoch": 3.87858347386172, + "grad_norm": 7.660805708045417, + "kl": 0.515625, + "learning_rate": 2.2529510961214166e-07, + "loss": 0.0005, + "reward": 3.45150625705719, + "reward_std": 0.13643109984695911, + "rewards/final_reward": 1.5186618102721703, + "rewards/mask_iou_reward": 0.7593309051360851, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4515064358711243, + "rewards/thk_ans_format_reward": 1.0, + "step": 2297, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.0625, + "epoch": 3.8802698145025296, + "grad_norm": 9.538622827190537, + "kl": 0.53515625, + "learning_rate": 2.2495784148397975e-07, + "loss": 0.0005, + "reward": 3.696483016014099, + "reward_std": 0.07705111056566238, + "rewards/final_reward": 1.8266000804889195, + "rewards/mask_iou_reward": 0.9133000402444598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6964829564094543, + "rewards/thk_ans_format_reward": 1.0, + "step": 2298, + "think_completion_length": 41.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.203125, + "epoch": 3.881956155143339, + "grad_norm": 61.88683587213132, + "kl": 0.4921875, + "learning_rate": 2.2462057335581786e-07, + "loss": 0.0005, + "reward": 3.261089324951172, + "reward_std": 0.3139321506023407, + "rewards/final_reward": 1.5563992586356612, + "rewards/mask_iou_reward": 0.7781996293178306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2610894441604614, + "rewards/thk_ans_format_reward": 1.0, + "step": 2299, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.25, + "epoch": 3.8836424957841484, + "grad_norm": 14.236252838419876, + "kl": 0.5625, + "learning_rate": 2.2428330522765598e-07, + "loss": 0.0006, + "reward": 3.439433217048645, + "reward_std": 0.03572419285774231, + "rewards/final_reward": 1.9520528977990645, + "rewards/mask_iou_reward": 0.9760264488995323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4394332766532898, + "rewards/thk_ans_format_reward": 1.0, + "step": 2300, + "think_completion_length": 38.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.296875, + "epoch": 3.885328836424958, + "grad_norm": 8.491205443011415, + "kl": 0.521484375, + "learning_rate": 2.239460370994941e-07, + "loss": 0.0005, + "reward": 3.5966683626174927, + "reward_std": 0.05547321029007435, + "rewards/final_reward": 1.6187499078921728, + "rewards/mask_iou_reward": 0.8093749539460864, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.596668303012848, + "rewards/thk_ans_format_reward": 1.0, + "step": 2301, + "think_completion_length": 40.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 3.8870151770657673, + "grad_norm": 9.93154426736727, + "kl": 0.529296875, + "learning_rate": 2.236087689713322e-07, + "loss": 0.0005, + "reward": 3.212773323059082, + "reward_std": 0.04713407810777426, + "rewards/final_reward": 1.1338621025049003, + "rewards/mask_iou_reward": 0.5669310512524501, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2127732634544373, + "rewards/thk_ans_format_reward": 1.0, + "step": 2302, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.125, + "epoch": 3.8887015177065765, + "grad_norm": 7.367578602723742, + "kl": 0.560546875, + "learning_rate": 2.2327150084317032e-07, + "loss": 0.0006, + "reward": 3.3229422569274902, + "reward_std": 0.18621986359357834, + "rewards/final_reward": 1.3275170722478604, + "rewards/mask_iou_reward": 0.6637585361239302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.322942316532135, + "rewards/thk_ans_format_reward": 1.0, + "step": 2303, + "think_completion_length": 42.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.3125, + "epoch": 3.890387858347386, + "grad_norm": 6.136457414582041, + "kl": 0.5322265625, + "learning_rate": 2.229342327150084e-07, + "loss": 0.0005, + "reward": 3.64126718044281, + "reward_std": 0.048449140042066574, + "rewards/final_reward": 1.8254532397981125, + "rewards/mask_iou_reward": 0.9127266198990562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6412672400474548, + "rewards/thk_ans_format_reward": 1.0, + "step": 2304, + "think_completion_length": 39.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.5625, + "epoch": 3.8920741989881957, + "grad_norm": 33.440203072134736, + "kl": 0.556640625, + "learning_rate": 2.2259696458684655e-07, + "loss": 0.0006, + "reward": 3.7791318893432617, + "reward_std": 0.054630931466817856, + "rewards/final_reward": 1.8482051781942652, + "rewards/mask_iou_reward": 0.9241025890971326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7791318893432617, + "rewards/thk_ans_format_reward": 1.0, + "step": 2305, + "think_completion_length": 43.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.890625, + "epoch": 3.893760539629005, + "grad_norm": 43.10138167582071, + "kl": 0.5234375, + "learning_rate": 2.2225969645868464e-07, + "loss": 0.0005, + "reward": 3.2736620903015137, + "reward_std": 0.4450536370277405, + "rewards/final_reward": 1.3462738375611605, + "rewards/mask_iou_reward": 0.6731369187805802, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.2892871499061584, + "rewards/thk_ans_format_reward": 1.0, + "step": 2306, + "think_completion_length": 48.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.4375, + "epoch": 3.8954468802698146, + "grad_norm": 8.890895899704827, + "kl": 0.529296875, + "learning_rate": 2.2192242833052275e-07, + "loss": 0.0005, + "reward": 3.341302990913391, + "reward_std": 0.10977509245276451, + "rewards/final_reward": 1.0956768907996606, + "rewards/mask_iou_reward": 0.5478384453998303, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3413029909133911, + "rewards/thk_ans_format_reward": 1.0, + "step": 2307, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.546875, + "epoch": 3.897133220910624, + "grad_norm": 4.020982898499106, + "kl": 0.4716796875, + "learning_rate": 2.2158516020236086e-07, + "loss": 0.0004, + "reward": 3.4928349256515503, + "reward_std": 0.053374568466097116, + "rewards/final_reward": 1.880111202432228, + "rewards/mask_iou_reward": 0.940055601216114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4928348660469055, + "rewards/thk_ans_format_reward": 1.0, + "step": 2308, + "think_completion_length": 39.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.734375, + "epoch": 3.8988195615514334, + "grad_norm": 8.190335018830517, + "kl": 0.60546875, + "learning_rate": 2.2124789207419898e-07, + "loss": 0.0006, + "reward": 3.276050329208374, + "reward_std": 0.14282017201185226, + "rewards/final_reward": 1.581921026116359, + "rewards/mask_iou_reward": 0.7909605130581795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2760505676269531, + "rewards/thk_ans_format_reward": 1.0, + "step": 2309, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.828125, + "epoch": 3.9005059021922426, + "grad_norm": 12.560647227806385, + "kl": 0.48046875, + "learning_rate": 2.209106239460371e-07, + "loss": 0.0005, + "reward": 3.7485271692276, + "reward_std": 0.062254197895526886, + "rewards/final_reward": 1.8556166528903737, + "rewards/mask_iou_reward": 0.9278083264451868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7485272288322449, + "rewards/thk_ans_format_reward": 1.0, + "step": 2310, + "think_completion_length": 40.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.34375, + "epoch": 3.902192242833052, + "grad_norm": 11.193565133332825, + "kl": 0.583984375, + "learning_rate": 2.205733558178752e-07, + "loss": 0.0006, + "reward": 3.7668726444244385, + "reward_std": 0.020487097091972828, + "rewards/final_reward": 1.5766710877741281, + "rewards/mask_iou_reward": 0.7883355438870641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7668728232383728, + "rewards/thk_ans_format_reward": 1.0, + "step": 2311, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.609375, + "epoch": 3.903878583473862, + "grad_norm": 16.8454073316175, + "kl": 0.5361328125, + "learning_rate": 2.202360876897133e-07, + "loss": 0.0005, + "reward": 2.701531767845154, + "reward_std": 0.2758069708943367, + "rewards/final_reward": 0.5598322417967414, + "rewards/mask_iou_reward": 0.2799161208983707, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 0.732781708240509, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2312, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.734375, + "epoch": 3.905564924114671, + "grad_norm": 6.042807399278246, + "kl": 0.533203125, + "learning_rate": 2.1989881956155143e-07, + "loss": 0.0005, + "reward": 3.065964460372925, + "reward_std": 0.05612972844392061, + "rewards/final_reward": 1.332514875414373, + "rewards/mask_iou_reward": 0.6662574377071865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0659645199775696, + "rewards/thk_ans_format_reward": 1.0, + "step": 2313, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.09375, + "epoch": 3.9072512647554807, + "grad_norm": 20.914529546989716, + "kl": 0.6171875, + "learning_rate": 2.1956155143338952e-07, + "loss": 0.0006, + "reward": 3.776341199874878, + "reward_std": 0.06756392121315002, + "rewards/final_reward": 1.6560440449945635, + "rewards/mask_iou_reward": 0.8280220224972817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7763413190841675, + "rewards/thk_ans_format_reward": 1.0, + "step": 2314, + "think_completion_length": 51.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.953125, + "epoch": 3.9089376053962903, + "grad_norm": 7.652094886113456, + "kl": 0.62109375, + "learning_rate": 2.1922428330522766e-07, + "loss": 0.0007, + "reward": 3.4693918228149414, + "reward_std": 0.07247776072472334, + "rewards/final_reward": 1.3367317342178633, + "rewards/mask_iou_reward": 0.6683658671089316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4693917036056519, + "rewards/thk_ans_format_reward": 1.0, + "step": 2315, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.65625, + "epoch": 3.9106239460370995, + "grad_norm": 16.51306856454786, + "kl": 0.4873046875, + "learning_rate": 2.1888701517706575e-07, + "loss": 0.0005, + "reward": 3.5613391399383545, + "reward_std": 0.20852696895599365, + "rewards/final_reward": 1.490526290384657, + "rewards/mask_iou_reward": 0.7452631451923285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5613394379615784, + "rewards/thk_ans_format_reward": 1.0, + "step": 2316, + "think_completion_length": 42.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.65625, + "epoch": 3.9123102866779087, + "grad_norm": 13.014352060898924, + "kl": 0.58203125, + "learning_rate": 2.1854974704890386e-07, + "loss": 0.0006, + "reward": 3.795401930809021, + "reward_std": 0.07416247483342886, + "rewards/final_reward": 1.679278486730068, + "rewards/mask_iou_reward": 0.839639243365034, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7954018115997314, + "rewards/thk_ans_format_reward": 1.0, + "step": 2317, + "think_completion_length": 46.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.453125, + "epoch": 3.9139966273187183, + "grad_norm": 8.64054859551105, + "kl": 0.4921875, + "learning_rate": 2.1821247892074197e-07, + "loss": 0.0005, + "reward": 3.2304306030273438, + "reward_std": 0.41698751598596573, + "rewards/final_reward": 1.578626626927155, + "rewards/mask_iou_reward": 0.7893133134635775, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.324180543422699, + "rewards/thk_ans_format_reward": 0.953125, + "step": 2318, + "think_completion_length": 40.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.96875, + "epoch": 3.915682967959528, + "grad_norm": 15.939587317660832, + "kl": 0.611328125, + "learning_rate": 2.178752107925801e-07, + "loss": 0.0006, + "reward": 3.709532141685486, + "reward_std": 0.0666387677192688, + "rewards/final_reward": 1.6968465215991122, + "rewards/mask_iou_reward": 0.8484232607995561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7095322012901306, + "rewards/thk_ans_format_reward": 1.0, + "step": 2319, + "think_completion_length": 46.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.71875, + "epoch": 3.917369308600337, + "grad_norm": 5.404933794280482, + "kl": 0.650390625, + "learning_rate": 2.1753794266441818e-07, + "loss": 0.0007, + "reward": 3.586440086364746, + "reward_std": 0.16442099958658218, + "rewards/final_reward": 1.5678774901850385, + "rewards/mask_iou_reward": 0.7839387450925193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5864400267601013, + "rewards/thk_ans_format_reward": 1.0, + "step": 2320, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.25, + "epoch": 3.919055649241147, + "grad_norm": 13.873493009868913, + "kl": 0.509765625, + "learning_rate": 2.1720067453625632e-07, + "loss": 0.0005, + "reward": 3.51028573513031, + "reward_std": 0.04750672448426485, + "rewards/final_reward": 1.8690834462785189, + "rewards/mask_iou_reward": 0.9345417231392594, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.510285496711731, + "rewards/thk_ans_format_reward": 1.0, + "step": 2321, + "think_completion_length": 49.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.0625, + "epoch": 3.920741989881956, + "grad_norm": 7.491766455383782, + "kl": 0.48046875, + "learning_rate": 2.1686340640809443e-07, + "loss": 0.0005, + "reward": 2.8261446952819824, + "reward_std": 0.4372350126504898, + "rewards/final_reward": 0.760745959984364, + "rewards/mask_iou_reward": 0.380372979992182, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 0.9355196356773376, + "rewards/thk_ans_format_reward": 0.953125, + "step": 2322, + "think_completion_length": 51.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.890625, + "epoch": 3.9224283305227656, + "grad_norm": 8.40222217416887, + "kl": 0.53515625, + "learning_rate": 2.1652613827993254e-07, + "loss": 0.0005, + "reward": 3.835439443588257, + "reward_std": 0.06295907869935036, + "rewards/final_reward": 1.7956223026772338, + "rewards/mask_iou_reward": 0.8978111513386169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8354395031929016, + "rewards/thk_ans_format_reward": 1.0, + "step": 2323, + "think_completion_length": 40.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.890625, + "epoch": 3.924114671163575, + "grad_norm": 4.534313067244203, + "kl": 0.548828125, + "learning_rate": 2.1618887015177066e-07, + "loss": 0.0005, + "reward": 3.424034357070923, + "reward_std": 0.10503194469492882, + "rewards/final_reward": 0.9665427966019128, + "rewards/mask_iou_reward": 0.4832713983009564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4240343272686005, + "rewards/thk_ans_format_reward": 1.0, + "step": 2324, + "think_completion_length": 39.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.96875, + "epoch": 3.9258010118043845, + "grad_norm": 5.49761755891851, + "kl": 0.5283203125, + "learning_rate": 2.1585160202360875e-07, + "loss": 0.0005, + "reward": 3.173019528388977, + "reward_std": 0.19381612539291382, + "rewards/final_reward": 1.1044718489811443, + "rewards/mask_iou_reward": 0.5522359244905721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1730196475982666, + "rewards/thk_ans_format_reward": 1.0, + "step": 2325, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.90625, + "epoch": 3.927487352445194, + "grad_norm": 10.577534335724108, + "kl": 0.5390625, + "learning_rate": 2.1551433389544689e-07, + "loss": 0.0005, + "reward": 3.248093843460083, + "reward_std": 0.06915931031107903, + "rewards/final_reward": 1.10379462136804, + "rewards/mask_iou_reward": 0.55189731068402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2480938136577606, + "rewards/thk_ans_format_reward": 1.0, + "step": 2326, + "think_completion_length": 40.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.84375, + "epoch": 3.9291736930860033, + "grad_norm": 17.10393035194125, + "kl": 0.5703125, + "learning_rate": 2.1517706576728497e-07, + "loss": 0.0006, + "reward": 3.4472368955612183, + "reward_std": 0.23830869793891907, + "rewards/final_reward": 1.4801394408547965, + "rewards/mask_iou_reward": 0.7400697204273983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.447236955165863, + "rewards/thk_ans_format_reward": 1.0, + "step": 2327, + "think_completion_length": 42.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0, + "epoch": 3.930860033726813, + "grad_norm": 26.239557118806275, + "kl": 0.517578125, + "learning_rate": 2.1483979763912311e-07, + "loss": 0.0005, + "reward": 3.143664836883545, + "reward_std": 0.11279793456196785, + "rewards/final_reward": 1.392644433981519, + "rewards/mask_iou_reward": 0.6963222169907595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.143664836883545, + "rewards/thk_ans_format_reward": 1.0, + "step": 2328, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.71875, + "epoch": 3.932546374367622, + "grad_norm": 7.215896642060539, + "kl": 0.501953125, + "learning_rate": 2.145025295109612e-07, + "loss": 0.0005, + "reward": 3.80619215965271, + "reward_std": 0.05122208781540394, + "rewards/final_reward": 1.8444367542741427, + "rewards/mask_iou_reward": 0.9222183771370713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8061923384666443, + "rewards/thk_ans_format_reward": 1.0, + "step": 2329, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.15625, + "epoch": 3.9342327150084317, + "grad_norm": 139.07716888413185, + "kl": 0.5107421875, + "learning_rate": 2.1416526138279931e-07, + "loss": 0.0005, + "reward": 3.829445242881775, + "reward_std": 0.03507534274831414, + "rewards/final_reward": 1.7253977980439543, + "rewards/mask_iou_reward": 0.8626988990219772, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8294451236724854, + "rewards/thk_ans_format_reward": 1.0, + "step": 2330, + "think_completion_length": 45.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.890625, + "epoch": 3.935919055649241, + "grad_norm": 10.157885839120912, + "kl": 0.568359375, + "learning_rate": 2.1382799325463743e-07, + "loss": 0.0006, + "reward": 3.1242516040802, + "reward_std": 0.2557784169912338, + "rewards/final_reward": 0.6962164457702434, + "rewards/mask_iou_reward": 0.3481082228851217, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1242516934871674, + "rewards/thk_ans_format_reward": 1.0, + "step": 2331, + "think_completion_length": 49.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.3125, + "epoch": 3.9376053962900506, + "grad_norm": 15.872231262992985, + "kl": 0.61328125, + "learning_rate": 2.1349072512647554e-07, + "loss": 0.0006, + "reward": 3.497292399406433, + "reward_std": 0.04093513707630336, + "rewards/final_reward": 1.8676119399339668, + "rewards/mask_iou_reward": 0.9338059699669834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.497292399406433, + "rewards/thk_ans_format_reward": 1.0, + "step": 2332, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.046875, + "epoch": 3.93929173693086, + "grad_norm": 18.09617095066498, + "kl": 0.603515625, + "learning_rate": 2.1315345699831366e-07, + "loss": 0.0006, + "reward": 3.1687710285186768, + "reward_std": 0.0905944537371397, + "rewards/final_reward": 1.2206195223726, + "rewards/mask_iou_reward": 0.6103097611863, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1687710881233215, + "rewards/thk_ans_format_reward": 1.0, + "step": 2333, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.984375, + "epoch": 3.9409780775716694, + "grad_norm": 6.712108613719201, + "kl": 0.52734375, + "learning_rate": 2.1281618887015177e-07, + "loss": 0.0005, + "reward": 3.137063980102539, + "reward_std": 0.03329486772418022, + "rewards/final_reward": 1.110083896969302, + "rewards/mask_iou_reward": 0.555041948484651, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1370639204978943, + "rewards/thk_ans_format_reward": 1.0, + "step": 2334, + "think_completion_length": 42.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.65625, + "epoch": 3.942664418212479, + "grad_norm": 30.11300081634084, + "kl": 0.48828125, + "learning_rate": 2.1247892074198986e-07, + "loss": 0.0004, + "reward": 3.1940513849258423, + "reward_std": 0.3399582654237747, + "rewards/final_reward": 1.6859626967085002, + "rewards/mask_iou_reward": 0.8429813483542501, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1940513253211975, + "rewards/thk_ans_format_reward": 1.0, + "step": 2335, + "think_completion_length": 45.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.109375, + "epoch": 3.9443507588532882, + "grad_norm": 15.486041233379309, + "kl": 0.55859375, + "learning_rate": 2.12141652613828e-07, + "loss": 0.0006, + "reward": 3.3924100399017334, + "reward_std": 0.06121925637125969, + "rewards/final_reward": 1.5387883976333216, + "rewards/mask_iou_reward": 0.7693941988166608, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3924100399017334, + "rewards/thk_ans_format_reward": 1.0, + "step": 2336, + "think_completion_length": 52.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.53125, + "epoch": 3.946037099494098, + "grad_norm": 24.18681337716054, + "kl": 0.5703125, + "learning_rate": 2.1180438448566609e-07, + "loss": 0.0006, + "reward": 2.8781609535217285, + "reward_std": 0.08533753454685211, + "rewards/final_reward": 1.4222425338746256, + "rewards/mask_iou_reward": 0.7111212669373128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8781610429286957, + "rewards/thk_ans_format_reward": 1.0, + "step": 2337, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.765625, + "epoch": 3.947723440134907, + "grad_norm": 13.800023998033083, + "kl": 0.79296875, + "learning_rate": 2.114671163575042e-07, + "loss": 0.0008, + "reward": 3.460109233856201, + "reward_std": 0.06012635678052902, + "rewards/final_reward": 1.5778669843797775, + "rewards/mask_iou_reward": 0.7889334921898887, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.460109293460846, + "rewards/thk_ans_format_reward": 1.0, + "step": 2338, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.265625, + "epoch": 3.9494097807757167, + "grad_norm": 7.30648588552527, + "kl": 0.58984375, + "learning_rate": 2.111298482293423e-07, + "loss": 0.0006, + "reward": 3.709952473640442, + "reward_std": 0.01341787725687027, + "rewards/final_reward": 1.9771459305174437, + "rewards/mask_iou_reward": 0.9885729652587218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7099525332450867, + "rewards/thk_ans_format_reward": 1.0, + "step": 2339, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.03125, + "epoch": 3.9510961214165263, + "grad_norm": 10.580802313832068, + "kl": 0.70703125, + "learning_rate": 2.1079258010118043e-07, + "loss": 0.0007, + "reward": 3.5735737085342407, + "reward_std": 0.05377620831131935, + "rewards/final_reward": 1.7095786501825816, + "rewards/mask_iou_reward": 0.8547893250912908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5735737085342407, + "rewards/thk_ans_format_reward": 1.0, + "step": 2340, + "think_completion_length": 45.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.703125, + "epoch": 3.9527824620573355, + "grad_norm": 5.796140618878087, + "kl": 0.453125, + "learning_rate": 2.1045531197301854e-07, + "loss": 0.0003, + "reward": 2.8115952014923096, + "reward_std": 0.059875136241316795, + "rewards/final_reward": 1.033261149128388, + "rewards/mask_iou_reward": 0.516630574564194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.811595231294632, + "rewards/thk_ans_format_reward": 1.0, + "step": 2341, + "think_completion_length": 44.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.609375, + "epoch": 3.954468802698145, + "grad_norm": 7.097882097442878, + "kl": 0.541015625, + "learning_rate": 2.1011804384485665e-07, + "loss": 0.0005, + "reward": 3.1771254539489746, + "reward_std": 0.14247756265103817, + "rewards/final_reward": 0.6251058696369434, + "rewards/mask_iou_reward": 0.3125529348184717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1771255135536194, + "rewards/thk_ans_format_reward": 1.0, + "step": 2342, + "think_completion_length": 51.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.484375, + "epoch": 3.9561551433389543, + "grad_norm": 9.763072265442423, + "kl": 0.556640625, + "learning_rate": 2.0978077571669474e-07, + "loss": 0.0006, + "reward": 3.623378276824951, + "reward_std": 0.015420469455420971, + "rewards/final_reward": 1.4593983815876321, + "rewards/mask_iou_reward": 0.7296991907938161, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6233782768249512, + "rewards/thk_ans_format_reward": 1.0, + "step": 2343, + "think_completion_length": 48.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.734375, + "epoch": 3.957841483979764, + "grad_norm": 12.010130402601112, + "kl": 0.560546875, + "learning_rate": 2.0944350758853288e-07, + "loss": 0.0006, + "reward": 3.293154716491699, + "reward_std": 0.1442108228802681, + "rewards/final_reward": 1.1585732875079058, + "rewards/mask_iou_reward": 0.5792866437539529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2931545972824097, + "rewards/thk_ans_format_reward": 1.0, + "step": 2344, + "think_completion_length": 46.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.3125, + "epoch": 3.959527824620573, + "grad_norm": 19.952532610141517, + "kl": 0.55078125, + "learning_rate": 2.0910623946037097e-07, + "loss": 0.0005, + "reward": 3.4306150674819946, + "reward_std": 0.19564368575811386, + "rewards/final_reward": 1.5140829556181865, + "rewards/mask_iou_reward": 0.7570414778090933, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4306151866912842, + "rewards/thk_ans_format_reward": 1.0, + "step": 2345, + "think_completion_length": 45.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.90625, + "epoch": 3.961214165261383, + "grad_norm": 6.506383412589879, + "kl": 0.615234375, + "learning_rate": 2.087689713322091e-07, + "loss": 0.0006, + "reward": 3.2742691040039062, + "reward_std": 0.38428041338920593, + "rewards/final_reward": 1.3280366253202556, + "rewards/mask_iou_reward": 0.6640183126601278, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2742691040039062, + "rewards/thk_ans_format_reward": 1.0, + "step": 2346, + "think_completion_length": 44.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.75, + "epoch": 3.9629005059021924, + "grad_norm": 12.84224251522735, + "kl": 0.625, + "learning_rate": 2.084317032040472e-07, + "loss": 0.0006, + "reward": 3.4609274864196777, + "reward_std": 0.07086838409304619, + "rewards/final_reward": 1.5369746382770562, + "rewards/mask_iou_reward": 0.7684873191385281, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4609274864196777, + "rewards/thk_ans_format_reward": 1.0, + "step": 2347, + "think_completion_length": 42.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.171875, + "epoch": 3.9645868465430016, + "grad_norm": 7.010148238756001, + "kl": 0.541015625, + "learning_rate": 2.080944350758853e-07, + "loss": 0.0005, + "reward": 3.171350598335266, + "reward_std": 0.22759989090263844, + "rewards/final_reward": 0.6685336459687008, + "rewards/mask_iou_reward": 0.3342668229843504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.171350359916687, + "rewards/thk_ans_format_reward": 1.0, + "step": 2348, + "think_completion_length": 45.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.40625, + "epoch": 3.9662731871838113, + "grad_norm": 8.214608147697609, + "kl": 0.548828125, + "learning_rate": 2.0775716694772345e-07, + "loss": 0.0005, + "reward": 3.204972267150879, + "reward_std": 0.11227181181311607, + "rewards/final_reward": 0.797163704136042, + "rewards/mask_iou_reward": 0.398581852068021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2049723267555237, + "rewards/thk_ans_format_reward": 1.0, + "step": 2349, + "think_completion_length": 41.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.03125, + "epoch": 3.9679595278246205, + "grad_norm": 21.59843072432012, + "kl": 0.58984375, + "learning_rate": 2.0741989881956154e-07, + "loss": 0.0006, + "reward": 3.084683656692505, + "reward_std": 0.12124911695718765, + "rewards/final_reward": 1.2331499388594027, + "rewards/mask_iou_reward": 0.6165749694297014, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.100308820605278, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2350, + "think_completion_length": 49.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.1875, + "epoch": 3.96964586846543, + "grad_norm": 33.7028917585473, + "kl": 0.55859375, + "learning_rate": 2.0708263069139965e-07, + "loss": 0.0005, + "reward": 3.4958006143569946, + "reward_std": 0.12888287706300616, + "rewards/final_reward": 1.1570881066591203, + "rewards/mask_iou_reward": 0.5785440533295602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4958006739616394, + "rewards/thk_ans_format_reward": 1.0, + "step": 2351, + "think_completion_length": 45.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.34375, + "epoch": 3.9713322091062393, + "grad_norm": 16.717874723781456, + "kl": 0.5, + "learning_rate": 2.0674536256323777e-07, + "loss": 0.0005, + "reward": 3.160848021507263, + "reward_std": 0.3593662567436695, + "rewards/final_reward": 1.1463538950898486, + "rewards/mask_iou_reward": 0.5731769475449243, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.254598081111908, + "rewards/thk_ans_format_reward": 0.953125, + "step": 2352, + "think_completion_length": 53.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.109375, + "epoch": 3.973018549747049, + "grad_norm": 9.90686934332323, + "kl": 0.53515625, + "learning_rate": 2.0640809443507588e-07, + "loss": 0.0005, + "reward": 3.0223071575164795, + "reward_std": 0.05929320678114891, + "rewards/final_reward": 1.237988610147628, + "rewards/mask_iou_reward": 0.618994305073814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0223073363304138, + "rewards/thk_ans_format_reward": 1.0, + "step": 2353, + "think_completion_length": 51.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.171875, + "epoch": 3.9747048903878586, + "grad_norm": 7.787093798034208, + "kl": 0.578125, + "learning_rate": 2.06070826306914e-07, + "loss": 0.0006, + "reward": 3.5822603702545166, + "reward_std": 0.060061621479690075, + "rewards/final_reward": 1.4014618811568327, + "rewards/mask_iou_reward": 0.7007309405784163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5822604298591614, + "rewards/thk_ans_format_reward": 1.0, + "step": 2354, + "think_completion_length": 47.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.734375, + "epoch": 3.9763912310286678, + "grad_norm": 36.662165992532785, + "kl": 0.580078125, + "learning_rate": 2.057335581787521e-07, + "loss": 0.0006, + "reward": 2.6705543994903564, + "reward_std": 0.11838686466217041, + "rewards/final_reward": 0.5942330853210306, + "rewards/mask_iou_reward": 0.2971165426605153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6705543994903564, + "rewards/thk_ans_format_reward": 1.0, + "step": 2355, + "think_completion_length": 41.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.46875, + "epoch": 3.9780775716694774, + "grad_norm": 5.048215872579047, + "kl": 0.53125, + "learning_rate": 2.053962900505902e-07, + "loss": 0.0005, + "reward": 3.039967894554138, + "reward_std": 0.16621370613574982, + "rewards/final_reward": 0.9111258281079331, + "rewards/mask_iou_reward": 0.45556291405396654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0399678200483322, + "rewards/thk_ans_format_reward": 1.0, + "step": 2356, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.296875, + "epoch": 3.9797639123102866, + "grad_norm": 5.680459626682978, + "kl": 0.552734375, + "learning_rate": 2.0505902192242834e-07, + "loss": 0.0006, + "reward": 3.610256791114807, + "reward_std": 0.15282126516103745, + "rewards/final_reward": 1.2514259912562973, + "rewards/mask_iou_reward": 0.6257129956281486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6102567911148071, + "rewards/thk_ans_format_reward": 1.0, + "step": 2357, + "think_completion_length": 41.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.640625, + "epoch": 3.9814502529510962, + "grad_norm": 50.9202273375314, + "kl": 0.490234375, + "learning_rate": 2.0472175379426642e-07, + "loss": 0.0005, + "reward": 3.60845947265625, + "reward_std": 0.14291292056441307, + "rewards/final_reward": 1.3316400335536596, + "rewards/mask_iou_reward": 0.6658200167768298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6084596514701843, + "rewards/thk_ans_format_reward": 1.0, + "step": 2358, + "think_completion_length": 41.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.625, + "epoch": 3.9831365935919054, + "grad_norm": 6.8048835926311355, + "kl": 0.5703125, + "learning_rate": 2.0438448566610456e-07, + "loss": 0.0006, + "reward": 3.695403814315796, + "reward_std": 0.05196426110342145, + "rewards/final_reward": 1.4792172508409012, + "rewards/mask_iou_reward": 0.7396086254204506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6954036355018616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2359, + "think_completion_length": 45.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.640625, + "epoch": 3.984822934232715, + "grad_norm": 7.436063431534893, + "kl": 0.623046875, + "learning_rate": 2.0404721753794265e-07, + "loss": 0.0006, + "reward": 3.3029123544692993, + "reward_std": 0.19523370638489723, + "rewards/final_reward": 1.8108300410136204, + "rewards/mask_iou_reward": 0.9054150205068102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3029123842716217, + "rewards/thk_ans_format_reward": 1.0, + "step": 2360, + "think_completion_length": 46.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.71875, + "epoch": 3.9865092748735247, + "grad_norm": 11.427629433570743, + "kl": 0.6171875, + "learning_rate": 2.0370994940978076e-07, + "loss": 0.0006, + "reward": 3.600027322769165, + "reward_std": 0.035907904617488384, + "rewards/final_reward": 1.8624177284520789, + "rewards/mask_iou_reward": 0.9312088642260394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.600027322769165, + "rewards/thk_ans_format_reward": 1.0, + "step": 2361, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.015625, + "epoch": 3.988195615514334, + "grad_norm": 6.780449514230903, + "kl": 0.640625, + "learning_rate": 2.0337268128161888e-07, + "loss": 0.0006, + "reward": 3.4493885040283203, + "reward_std": 0.05858028307557106, + "rewards/final_reward": 1.458034595134662, + "rewards/mask_iou_reward": 0.729017297567331, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4493885040283203, + "rewards/thk_ans_format_reward": 1.0, + "step": 2362, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.6875, + "epoch": 3.989881956155143, + "grad_norm": 8.232895015262699, + "kl": 0.58203125, + "learning_rate": 2.03035413153457e-07, + "loss": 0.0006, + "reward": 3.7412610054016113, + "reward_std": 0.07890792051330209, + "rewards/final_reward": 1.6300552338040415, + "rewards/mask_iou_reward": 0.8150276169020207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7412610054016113, + "rewards/thk_ans_format_reward": 1.0, + "step": 2363, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.28125, + "epoch": 3.9915682967959527, + "grad_norm": 13.791693880711723, + "kl": 0.6015625, + "learning_rate": 2.026981450252951e-07, + "loss": 0.0006, + "reward": 3.418661952018738, + "reward_std": 0.0330036785453558, + "rewards/final_reward": 1.1440329602519095, + "rewards/mask_iou_reward": 0.5720164801259547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4186618328094482, + "rewards/thk_ans_format_reward": 1.0, + "step": 2364, + "think_completion_length": 39.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 3.9932546374367623, + "grad_norm": 21.579353241651173, + "kl": 0.6796875, + "learning_rate": 2.0236087689713322e-07, + "loss": 0.0006, + "reward": 3.858848810195923, + "reward_std": 0.016931952442973852, + "rewards/final_reward": 1.8235267805660484, + "rewards/mask_iou_reward": 0.9117633902830242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8588489890098572, + "rewards/thk_ans_format_reward": 1.0, + "step": 2365, + "think_completion_length": 44.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.375, + "epoch": 3.9949409780775715, + "grad_norm": 11.854854255019934, + "kl": 0.509765625, + "learning_rate": 2.020236087689713e-07, + "loss": 0.0005, + "reward": 3.8232831954956055, + "reward_std": 0.032786943775136024, + "rewards/final_reward": 1.728261051517324, + "rewards/mask_iou_reward": 0.864130525758662, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8232831954956055, + "rewards/thk_ans_format_reward": 1.0, + "step": 2366, + "think_completion_length": 46.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5625, + "epoch": 3.996627318718381, + "grad_norm": 32.14007175303776, + "kl": 0.57421875, + "learning_rate": 2.0168634064080945e-07, + "loss": 0.0006, + "reward": 2.8483331203460693, + "reward_std": 0.015060745645314455, + "rewards/final_reward": 1.3362465005289894, + "rewards/mask_iou_reward": 0.6681232502644947, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8483331203460693, + "rewards/thk_ans_format_reward": 1.0, + "step": 2367, + "think_completion_length": 38.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.16666793823242, + "epoch": 3.998313659359191, + "grad_norm": 10.11380764879701, + "kl": 0.751953125, + "learning_rate": 2.0134907251264754e-07, + "loss": 0.0007, + "reward": 3.6402668952941895, + "reward_std": 0.021844581700861454, + "rewards/final_reward": 1.6423651190756967, + "rewards/mask_iou_reward": 0.8211825595378484, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6402669548988342, + "rewards/thk_ans_format_reward": 1.0, + "step": 2368, + "think_completion_length": 36.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.203125, + "epoch": 4.001686340640809, + "grad_norm": 11.480409829973361, + "kl": 0.6015625, + "learning_rate": 2.0101180438448565e-07, + "loss": 0.0006, + "reward": 3.5934234857559204, + "reward_std": 0.0587493684142828, + "rewards/final_reward": 1.948077071431705, + "rewards/mask_iou_reward": 0.9740385357158525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.59342360496521, + "rewards/thk_ans_format_reward": 1.0, + "step": 2369, + "think_completion_length": 47.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.34375, + "epoch": 4.003372681281619, + "grad_norm": 8.238883165059963, + "kl": 0.55859375, + "learning_rate": 2.0067453625632376e-07, + "loss": 0.0006, + "reward": 3.468377709388733, + "reward_std": 0.060116853564977646, + "rewards/final_reward": 1.3180356249804, + "rewards/mask_iou_reward": 0.6590178124902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4683776497840881, + "rewards/thk_ans_format_reward": 1.0, + "step": 2370, + "think_completion_length": 38.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.515625, + "epoch": 4.0050590219224285, + "grad_norm": 12.92183070167942, + "kl": 0.56640625, + "learning_rate": 2.0033726812816188e-07, + "loss": 0.0006, + "reward": 3.150872826576233, + "reward_std": 0.10184543719515204, + "rewards/final_reward": 0.9877310882947764, + "rewards/mask_iou_reward": 0.4938655441473882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1508729159832, + "rewards/thk_ans_format_reward": 1.0, + "step": 2371, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.28125, + "epoch": 4.006745362563238, + "grad_norm": 6.22318773334404, + "kl": 0.5615234375, + "learning_rate": 2e-07, + "loss": 0.0006, + "reward": 3.3521891832351685, + "reward_std": 0.06817868165671825, + "rewards/final_reward": 1.4573340237162564, + "rewards/mask_iou_reward": 0.7286670118581282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3521891832351685, + "rewards/thk_ans_format_reward": 1.0, + "step": 2372, + "think_completion_length": 42.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.140625, + "epoch": 4.008431703204047, + "grad_norm": 6.22020827034915, + "kl": 0.53515625, + "learning_rate": 1.996627318718381e-07, + "loss": 0.0005, + "reward": 3.731974720954895, + "reward_std": 0.0071526761166751385, + "rewards/final_reward": 1.5606302272620618, + "rewards/mask_iou_reward": 0.7803151136310309, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7319748997688293, + "rewards/thk_ans_format_reward": 1.0, + "step": 2373, + "think_completion_length": 46.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.359375, + "epoch": 4.010118043844857, + "grad_norm": 15.966196109750083, + "kl": 0.5546875, + "learning_rate": 1.993254637436762e-07, + "loss": 0.0005, + "reward": 3.180173635482788, + "reward_std": 0.2132977396249771, + "rewards/final_reward": 1.401960101621973, + "rewards/mask_iou_reward": 0.7009800508109865, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.195798546075821, + "rewards/thk_ans_format_reward": 1.0, + "step": 2374, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.828125, + "epoch": 4.011804384485666, + "grad_norm": 8.591836875153847, + "kl": 0.578125, + "learning_rate": 1.9898819561551433e-07, + "loss": 0.0006, + "reward": 3.532989501953125, + "reward_std": 0.14260661602020264, + "rewards/final_reward": 1.8065476158177485, + "rewards/mask_iou_reward": 0.9032738079088742, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.532989501953125, + "rewards/thk_ans_format_reward": 1.0, + "step": 2375, + "think_completion_length": 48.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.4375, + "epoch": 4.013490725126475, + "grad_norm": 6.403629080049531, + "kl": 0.521484375, + "learning_rate": 1.9865092748735242e-07, + "loss": 0.0006, + "reward": 2.971209168434143, + "reward_std": 0.059162041172385216, + "rewards/final_reward": 0.8289285008547014, + "rewards/mask_iou_reward": 0.4144642504273507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9712091982364655, + "rewards/thk_ans_format_reward": 1.0, + "step": 2376, + "think_completion_length": 42.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.015625, + "epoch": 4.015177065767285, + "grad_norm": 6.753527939628968, + "kl": 0.546875, + "learning_rate": 1.9831365935919056e-07, + "loss": 0.0006, + "reward": 3.7322280406951904, + "reward_std": 0.2169511088868603, + "rewards/final_reward": 1.5525031955747912, + "rewards/mask_iou_reward": 0.7762515977873956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7322279810905457, + "rewards/thk_ans_format_reward": 1.0, + "step": 2377, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.359375, + "epoch": 4.016863406408095, + "grad_norm": 18.263143894412075, + "kl": 0.611328125, + "learning_rate": 1.9797639123102867e-07, + "loss": 0.0006, + "reward": 3.4608668088912964, + "reward_std": 0.04524455638602376, + "rewards/final_reward": 1.5373226020513886, + "rewards/mask_iou_reward": 0.7686613010256943, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4608668088912964, + "rewards/thk_ans_format_reward": 1.0, + "step": 2378, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.28125, + "epoch": 4.018549747048904, + "grad_norm": 487.8030454095207, + "kl": 54.775390625, + "learning_rate": 1.9763912310286676e-07, + "loss": 0.0547, + "reward": 3.39677631855011, + "reward_std": 0.017723735887557268, + "rewards/final_reward": 0.8353646910599449, + "rewards/mask_iou_reward": 0.41768234552997247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3967764377593994, + "rewards/thk_ans_format_reward": 1.0, + "step": 2379, + "think_completion_length": 48.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.328125, + "epoch": 4.020236087689713, + "grad_norm": 11.546597629218004, + "kl": 0.63671875, + "learning_rate": 1.973018549747049e-07, + "loss": 0.0006, + "reward": 3.2893166542053223, + "reward_std": 0.20702505111694336, + "rewards/final_reward": 1.1057080017261876, + "rewards/mask_iou_reward": 0.5528540008630938, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2893165946006775, + "rewards/thk_ans_format_reward": 1.0, + "step": 2380, + "think_completion_length": 35.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.5625, + "epoch": 4.021922428330523, + "grad_norm": 12.940481833874944, + "kl": 0.591796875, + "learning_rate": 1.96964586846543e-07, + "loss": 0.0006, + "reward": 3.1833547353744507, + "reward_std": 0.01602939050644636, + "rewards/final_reward": 0.7768981750521698, + "rewards/mask_iou_reward": 0.3884490875260849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1833546161651611, + "rewards/thk_ans_format_reward": 1.0, + "step": 2381, + "think_completion_length": 42.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.640625, + "epoch": 4.023608768971332, + "grad_norm": 37.607564987993854, + "kl": 0.578125, + "learning_rate": 1.966273187183811e-07, + "loss": 0.0006, + "reward": 3.6645880937576294, + "reward_std": 0.05637491028755903, + "rewards/final_reward": 1.4050240383685577, + "rewards/mask_iou_reward": 0.7025120191842789, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.664588212966919, + "rewards/thk_ans_format_reward": 1.0, + "step": 2382, + "think_completion_length": 41.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.28125, + "epoch": 4.025295109612141, + "grad_norm": 11.439961980420474, + "kl": 0.548828125, + "learning_rate": 1.9629005059021922e-07, + "loss": 0.0005, + "reward": 3.040262460708618, + "reward_std": 0.0750212837010622, + "rewards/final_reward": 0.9258982103314888, + "rewards/mask_iou_reward": 0.4629491051657444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0402624607086182, + "rewards/thk_ans_format_reward": 1.0, + "step": 2383, + "think_completion_length": 44.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.890625, + "epoch": 4.0269814502529515, + "grad_norm": 4.516035931232741, + "kl": 0.48828125, + "learning_rate": 1.9595278246205733e-07, + "loss": 0.0005, + "reward": 3.7392314672470093, + "reward_std": 0.1627311073243618, + "rewards/final_reward": 1.717044571434485, + "rewards/mask_iou_reward": 0.8585222857172425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7392314672470093, + "rewards/thk_ans_format_reward": 1.0, + "step": 2384, + "think_completion_length": 44.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.90625, + "epoch": 4.028667790893761, + "grad_norm": 19.841883618522072, + "kl": 0.568359375, + "learning_rate": 1.9561551433389544e-07, + "loss": 0.0006, + "reward": 2.8191603422164917, + "reward_std": 0.041978662833571434, + "rewards/final_reward": 0.7025187247498247, + "rewards/mask_iou_reward": 0.35125936237491234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8191602826118469, + "rewards/thk_ans_format_reward": 1.0, + "step": 2385, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.359375, + "epoch": 4.03035413153457, + "grad_norm": 7.039285605112933, + "kl": 0.58984375, + "learning_rate": 1.9527824620573356e-07, + "loss": 0.0006, + "reward": 3.5875240564346313, + "reward_std": 0.3093913681805134, + "rewards/final_reward": 1.5723538136182496, + "rewards/mask_iou_reward": 0.7861769068091248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.587524175643921, + "rewards/thk_ans_format_reward": 1.0, + "step": 2386, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.90625, + "epoch": 4.032040472175379, + "grad_norm": 12.816288500352174, + "kl": 0.5859375, + "learning_rate": 1.9494097807757165e-07, + "loss": 0.0006, + "reward": 2.8287243843078613, + "reward_std": 0.5208190828561783, + "rewards/final_reward": 0.7635315952793571, + "rewards/mask_iou_reward": 0.38176579763967855, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.8912242650985718, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2387, + "think_completion_length": 40.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.328125, + "epoch": 4.033726812816189, + "grad_norm": 25.36677976502114, + "kl": 0.53125, + "learning_rate": 1.9460370994940979e-07, + "loss": 0.0005, + "reward": 3.5308085680007935, + "reward_std": 0.060908347368240356, + "rewards/final_reward": 1.7344693208015038, + "rewards/mask_iou_reward": 0.8672346604007519, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5308085680007935, + "rewards/thk_ans_format_reward": 1.0, + "step": 2388, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.6875, + "epoch": 4.035413153456998, + "grad_norm": 14.45270041695548, + "kl": 0.90234375, + "learning_rate": 1.9426644182124787e-07, + "loss": 0.0009, + "reward": 3.292656898498535, + "reward_std": 0.13113961927592754, + "rewards/final_reward": 1.2586338570953175, + "rewards/mask_iou_reward": 0.6293169285476587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2926568984985352, + "rewards/thk_ans_format_reward": 1.0, + "step": 2389, + "think_completion_length": 46.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.484375, + "epoch": 4.0370994940978076, + "grad_norm": 7.587921954479646, + "kl": 0.5048828125, + "learning_rate": 1.9392917369308601e-07, + "loss": 0.0005, + "reward": 3.715697169303894, + "reward_std": 0.11376471444964409, + "rewards/final_reward": 1.812208194310015, + "rewards/mask_iou_reward": 0.9061040971550075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7156970500946045, + "rewards/thk_ans_format_reward": 1.0, + "step": 2390, + "think_completion_length": 36.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.734375, + "epoch": 4.038785834738618, + "grad_norm": 26.996972484909367, + "kl": 0.55859375, + "learning_rate": 1.935919055649241e-07, + "loss": 0.0006, + "reward": 3.1315231323242188, + "reward_std": 0.1370735252276063, + "rewards/final_reward": 0.8673822408615481, + "rewards/mask_iou_reward": 0.43369112043077407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.131523072719574, + "rewards/thk_ans_format_reward": 1.0, + "step": 2391, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.59375, + "epoch": 4.040472175379427, + "grad_norm": 7.6753970617393, + "kl": 0.544921875, + "learning_rate": 1.9325463743676222e-07, + "loss": 0.0005, + "reward": 3.4671186208724976, + "reward_std": 0.04770086891949177, + "rewards/final_reward": 1.8936233515745924, + "rewards/mask_iou_reward": 0.9468116757872962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4671186208724976, + "rewards/thk_ans_format_reward": 1.0, + "step": 2392, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.4375, + "epoch": 4.042158516020236, + "grad_norm": 8.931034431762196, + "kl": 0.578125, + "learning_rate": 1.9291736930860033e-07, + "loss": 0.0006, + "reward": 3.7695319652557373, + "reward_std": 0.21818761248141527, + "rewards/final_reward": 1.7540229682748367, + "rewards/mask_iou_reward": 0.8770114841374184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7695319652557373, + "rewards/thk_ans_format_reward": 1.0, + "step": 2393, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.484375, + "epoch": 4.043844856661045, + "grad_norm": 12.924364201978555, + "kl": 1.615234375, + "learning_rate": 1.9258010118043844e-07, + "loss": 0.0016, + "reward": 3.437775492668152, + "reward_std": 0.25223083049058914, + "rewards/final_reward": 1.2896175495769646, + "rewards/mask_iou_reward": 0.6448087747884823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.437775433063507, + "rewards/thk_ans_format_reward": 1.0, + "step": 2394, + "think_completion_length": 42.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.640625, + "epoch": 4.045531197301855, + "grad_norm": 23.626398444759594, + "kl": 0.56640625, + "learning_rate": 1.9224283305227653e-07, + "loss": 0.0006, + "reward": 3.8039190769195557, + "reward_std": 0.00993341370485723, + "rewards/final_reward": 1.8160529556955798, + "rewards/mask_iou_reward": 0.9080264778477899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8039190769195557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2395, + "think_completion_length": 39.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.609375, + "epoch": 4.0472175379426645, + "grad_norm": 10.156897671568798, + "kl": 0.55859375, + "learning_rate": 1.9190556492411467e-07, + "loss": 0.0006, + "reward": 3.070542812347412, + "reward_std": 0.06726253964006901, + "rewards/final_reward": 0.5538109703120524, + "rewards/mask_iou_reward": 0.2769054851560262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0705427527427673, + "rewards/thk_ans_format_reward": 1.0, + "step": 2396, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.109375, + "epoch": 4.048903878583474, + "grad_norm": 5.564434290948505, + "kl": 0.623046875, + "learning_rate": 1.9156829679595276e-07, + "loss": 0.0006, + "reward": 3.2643067836761475, + "reward_std": 0.1904737390577793, + "rewards/final_reward": 1.0842811559956482, + "rewards/mask_iou_reward": 0.5421405779978241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2643068432807922, + "rewards/thk_ans_format_reward": 1.0, + "step": 2397, + "think_completion_length": 40.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.140625, + "epoch": 4.050590219224283, + "grad_norm": 6.736417126637982, + "kl": 0.5703125, + "learning_rate": 1.912310286677909e-07, + "loss": 0.0006, + "reward": 3.818103551864624, + "reward_std": 0.015501199522987008, + "rewards/final_reward": 1.8088766443374351, + "rewards/mask_iou_reward": 0.9044383221687176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8181034922599792, + "rewards/thk_ans_format_reward": 1.0, + "step": 2398, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.953125, + "epoch": 4.052276559865093, + "grad_norm": 9.671563346532743, + "kl": 0.50390625, + "learning_rate": 1.9089376053962899e-07, + "loss": 0.0005, + "reward": 3.338501453399658, + "reward_std": 0.28296706080436707, + "rewards/final_reward": 1.1677833458807307, + "rewards/mask_iou_reward": 0.5838916729403654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3385014832019806, + "rewards/thk_ans_format_reward": 1.0, + "step": 2399, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.359375, + "epoch": 4.053962900505902, + "grad_norm": 12.746302672892734, + "kl": 0.576171875, + "learning_rate": 1.905564924114671e-07, + "loss": 0.0006, + "reward": 3.299188733100891, + "reward_std": 0.3755532205104828, + "rewards/final_reward": 1.1859492626462202, + "rewards/mask_iou_reward": 0.5929746313231101, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2991888523101807, + "rewards/thk_ans_format_reward": 1.0, + "step": 2400, + "think_completion_length": 41.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.46875, + "epoch": 4.055649241146711, + "grad_norm": 3.9789913504043333, + "kl": 0.49609375, + "learning_rate": 1.9021922428330521e-07, + "loss": 0.0005, + "reward": 3.257691979408264, + "reward_std": 0.27566526364535093, + "rewards/final_reward": 1.202514783245717, + "rewards/mask_iou_reward": 0.6012573916228585, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.288942039012909, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2401, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.328125, + "epoch": 4.057335581787521, + "grad_norm": 6.355089159537488, + "kl": 0.591796875, + "learning_rate": 1.8988195615514333e-07, + "loss": 0.0006, + "reward": 3.548841118812561, + "reward_std": 0.12714591436088085, + "rewards/final_reward": 1.359696598663631, + "rewards/mask_iou_reward": 0.6798482993318155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5488411784172058, + "rewards/thk_ans_format_reward": 1.0, + "step": 2402, + "think_completion_length": 36.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.796875, + "epoch": 4.059021922428331, + "grad_norm": 26.422139998000524, + "kl": 0.62890625, + "learning_rate": 1.8954468802698144e-07, + "loss": 0.0006, + "reward": 3.0529425144195557, + "reward_std": 0.14811351895332336, + "rewards/final_reward": 1.177692374918326, + "rewards/mask_iou_reward": 0.588846187459163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0529426038265228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2403, + "think_completion_length": 39.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.140625, + "epoch": 4.06070826306914, + "grad_norm": 40.14506228897134, + "kl": 0.58984375, + "learning_rate": 1.8920741989881955e-07, + "loss": 0.0006, + "reward": 3.2226529121398926, + "reward_std": 0.04350670985877514, + "rewards/final_reward": 1.335124546034519, + "rewards/mask_iou_reward": 0.6675622730172595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2226530611515045, + "rewards/thk_ans_format_reward": 1.0, + "step": 2404, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.890625, + "epoch": 4.062394603709949, + "grad_norm": 61.836424536738996, + "kl": 0.609375, + "learning_rate": 1.8887015177065764e-07, + "loss": 0.0006, + "reward": 3.7547478675842285, + "reward_std": 0.054970819503068924, + "rewards/final_reward": 1.634220214433795, + "rewards/mask_iou_reward": 0.8171101072168975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7547478675842285, + "rewards/thk_ans_format_reward": 1.0, + "step": 2405, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.671875, + "epoch": 4.064080944350759, + "grad_norm": 8.199856621555274, + "kl": 0.572265625, + "learning_rate": 1.8853288364249578e-07, + "loss": 0.0006, + "reward": 3.1624104976654053, + "reward_std": 0.19921143352985382, + "rewards/final_reward": 1.2621524469381291, + "rewards/mask_iou_reward": 0.6310762234690646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.16241055727005, + "rewards/thk_ans_format_reward": 1.0, + "step": 2406, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.796875, + "epoch": 4.065767284991568, + "grad_norm": 9.440365557766706, + "kl": 0.548828125, + "learning_rate": 1.8819561551433387e-07, + "loss": 0.0006, + "reward": 3.2724127769470215, + "reward_std": 0.03121477458626032, + "rewards/final_reward": 1.23542375517504, + "rewards/mask_iou_reward": 0.61771187758752, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2724127173423767, + "rewards/thk_ans_format_reward": 1.0, + "step": 2407, + "think_completion_length": 41.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.078125, + "epoch": 4.0674536256323774, + "grad_norm": 27.360006415004754, + "kl": 0.572265625, + "learning_rate": 1.87858347386172e-07, + "loss": 0.0006, + "reward": 3.480781078338623, + "reward_std": 0.05732197128236294, + "rewards/final_reward": 1.324549316311801, + "rewards/mask_iou_reward": 0.6622746581559005, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.480781078338623, + "rewards/thk_ans_format_reward": 1.0, + "step": 2408, + "think_completion_length": 42.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.078125, + "epoch": 4.0691399662731875, + "grad_norm": 14.195257828449042, + "kl": 0.630859375, + "learning_rate": 1.8752107925801012e-07, + "loss": 0.0006, + "reward": 3.0159380435943604, + "reward_std": 0.11699309200048447, + "rewards/final_reward": 0.8030806539537556, + "rewards/mask_iou_reward": 0.4015403269768778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0315630435943604, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2409, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.28125, + "epoch": 4.070826306913997, + "grad_norm": 9.780196344080494, + "kl": 0.552734375, + "learning_rate": 1.871838111298482e-07, + "loss": 0.0006, + "reward": 3.4170422554016113, + "reward_std": 0.10918816924095154, + "rewards/final_reward": 1.6272089634097822, + "rewards/mask_iou_reward": 0.8136044817048911, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4170424938201904, + "rewards/thk_ans_format_reward": 1.0, + "step": 2410, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.0, + "epoch": 4.072512647554806, + "grad_norm": 10.678234098227074, + "kl": 0.61328125, + "learning_rate": 1.8684654300168635e-07, + "loss": 0.0006, + "reward": 3.7279247045516968, + "reward_std": 0.09347648662514985, + "rewards/final_reward": 1.619307008372024, + "rewards/mask_iou_reward": 0.809653504186012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7279247045516968, + "rewards/thk_ans_format_reward": 1.0, + "step": 2411, + "think_completion_length": 39.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.609375, + "epoch": 4.074198988195615, + "grad_norm": 10.855980004734976, + "kl": 0.59375, + "learning_rate": 1.8650927487352444e-07, + "loss": 0.0006, + "reward": 3.362097144126892, + "reward_std": 0.1303092995658517, + "rewards/final_reward": 1.6823154227344528, + "rewards/mask_iou_reward": 0.8411577113672264, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3620972633361816, + "rewards/thk_ans_format_reward": 1.0, + "step": 2412, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.109375, + "epoch": 4.075885328836425, + "grad_norm": 15.421168686239545, + "kl": 0.58984375, + "learning_rate": 1.8617200674536255e-07, + "loss": 0.0006, + "reward": 3.3645206689834595, + "reward_std": 0.248811274766922, + "rewards/final_reward": 1.2371536458376557, + "rewards/mask_iou_reward": 0.6185768229188279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3645207285881042, + "rewards/thk_ans_format_reward": 1.0, + "step": 2413, + "think_completion_length": 42.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.890625, + "epoch": 4.077571669477234, + "grad_norm": 25.17182493148204, + "kl": 0.529296875, + "learning_rate": 1.8583473861720067e-07, + "loss": 0.0006, + "reward": 3.806470274925232, + "reward_std": 0.0033699345076456666, + "rewards/final_reward": 1.8741425356650039, + "rewards/mask_iou_reward": 0.9370712678325019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8064703345298767, + "rewards/thk_ans_format_reward": 1.0, + "step": 2414, + "think_completion_length": 42.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.84375, + "epoch": 4.079258010118044, + "grad_norm": 10.074641437628669, + "kl": 0.5703125, + "learning_rate": 1.8549747048903878e-07, + "loss": 0.0006, + "reward": 3.4285424947738647, + "reward_std": 0.02089158445596695, + "rewards/final_reward": 1.782670384576963, + "rewards/mask_iou_reward": 0.8913351922884815, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4285423755645752, + "rewards/thk_ans_format_reward": 1.0, + "step": 2415, + "think_completion_length": 43.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.65625, + "epoch": 4.080944350758854, + "grad_norm": 9.826004226753085, + "kl": 0.580078125, + "learning_rate": 1.851602023608769e-07, + "loss": 0.0006, + "reward": 3.0336287021636963, + "reward_std": 0.09551212098449469, + "rewards/final_reward": 0.7605734952695639, + "rewards/mask_iou_reward": 0.38028674763478193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0336288213729858, + "rewards/thk_ans_format_reward": 1.0, + "step": 2416, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.921875, + "epoch": 4.082630691399663, + "grad_norm": 6.860726849627503, + "kl": 0.552734375, + "learning_rate": 1.84822934232715e-07, + "loss": 0.0006, + "reward": 3.3313989639282227, + "reward_std": 0.06857777805998921, + "rewards/final_reward": 1.7899774872625613, + "rewards/mask_iou_reward": 0.8949887436312807, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.331398993730545, + "rewards/thk_ans_format_reward": 1.0, + "step": 2417, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.484375, + "epoch": 4.084317032040472, + "grad_norm": 12.505909690671647, + "kl": 0.533203125, + "learning_rate": 1.844856661045531e-07, + "loss": 0.0005, + "reward": 3.3965485095977783, + "reward_std": 0.3861938640475273, + "rewards/final_reward": 1.1030617092141284, + "rewards/mask_iou_reward": 0.5515308546070642, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.505923569202423, + "rewards/thk_ans_format_reward": 0.9375, + "step": 2418, + "think_completion_length": 46.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.765625, + "epoch": 4.086003372681281, + "grad_norm": 7.129779888498348, + "kl": 0.625, + "learning_rate": 1.8414839797639124e-07, + "loss": 0.0006, + "reward": 3.7287741899490356, + "reward_std": 0.2574765458703041, + "rewards/final_reward": 1.6428350439463504, + "rewards/mask_iou_reward": 0.8214175219731752, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7287741303443909, + "rewards/thk_ans_format_reward": 1.0, + "step": 2419, + "think_completion_length": 41.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.328125, + "epoch": 4.087689713322091, + "grad_norm": 9.125420264391623, + "kl": 0.546875, + "learning_rate": 1.8381112984822932e-07, + "loss": 0.0005, + "reward": 3.340920567512512, + "reward_std": 0.35566695034503937, + "rewards/final_reward": 0.8635012586914437, + "rewards/mask_iou_reward": 0.43175062934572184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3409205675125122, + "rewards/thk_ans_format_reward": 1.0, + "step": 2420, + "think_completion_length": 41.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.25, + "epoch": 4.0893760539629005, + "grad_norm": 8.219293003863777, + "kl": 0.603515625, + "learning_rate": 1.8347386172006746e-07, + "loss": 0.0006, + "reward": 2.8761707544326782, + "reward_std": 0.06319956108927727, + "rewards/final_reward": 0.8135975983025594, + "rewards/mask_iou_reward": 0.4067987991512797, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8761707842350006, + "rewards/thk_ans_format_reward": 1.0, + "step": 2421, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.859375, + "epoch": 4.09106239460371, + "grad_norm": 11.65638932013057, + "kl": 0.544921875, + "learning_rate": 1.8313659359190555e-07, + "loss": 0.0005, + "reward": 3.4241864681243896, + "reward_std": 0.13773788139224052, + "rewards/final_reward": 1.3498022886924497, + "rewards/mask_iou_reward": 0.6749011443462248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4241865277290344, + "rewards/thk_ans_format_reward": 1.0, + "step": 2422, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6875, + "epoch": 4.09274873524452, + "grad_norm": 11.906764558427954, + "kl": 0.60546875, + "learning_rate": 1.8279932546374367e-07, + "loss": 0.0006, + "reward": 3.194682478904724, + "reward_std": 0.2606370970606804, + "rewards/final_reward": 1.521790364090617, + "rewards/mask_iou_reward": 0.7608951820453085, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1946824789047241, + "rewards/thk_ans_format_reward": 1.0, + "step": 2423, + "think_completion_length": 43.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.421875, + "epoch": 4.094435075885329, + "grad_norm": 11.78903892087091, + "kl": 0.541015625, + "learning_rate": 1.8246205733558178e-07, + "loss": 0.0005, + "reward": 3.5103652477264404, + "reward_std": 0.00921852933242917, + "rewards/final_reward": 1.670252714871993, + "rewards/mask_iou_reward": 0.8351263574359965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.51036536693573, + "rewards/thk_ans_format_reward": 1.0, + "step": 2424, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.828125, + "epoch": 4.096121416526138, + "grad_norm": 17.37244880870618, + "kl": 0.796875, + "learning_rate": 1.821247892074199e-07, + "loss": 0.0008, + "reward": 3.470105290412903, + "reward_std": 0.09997964650392532, + "rewards/final_reward": 1.7580551283626624, + "rewards/mask_iou_reward": 0.8790275641813312, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4701053500175476, + "rewards/thk_ans_format_reward": 1.0, + "step": 2425, + "think_completion_length": 38.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.34375, + "epoch": 4.097807757166947, + "grad_norm": 148.65573085337613, + "kl": 0.609375, + "learning_rate": 1.8178752107925798e-07, + "loss": 0.0006, + "reward": 2.7674933671951294, + "reward_std": 0.30003902316093445, + "rewards/final_reward": 0.5220741981410059, + "rewards/mask_iou_reward": 0.26103709907050293, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 0.8299932479858398, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2426, + "think_completion_length": 38.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 4.099494097807757, + "grad_norm": 5.518980680485829, + "kl": 0.564453125, + "learning_rate": 1.8145025295109612e-07, + "loss": 0.0006, + "reward": 3.371211051940918, + "reward_std": 0.15353204309940338, + "rewards/final_reward": 1.6059161935315704, + "rewards/mask_iou_reward": 0.8029580967657852, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3712111115455627, + "rewards/thk_ans_format_reward": 1.0, + "step": 2427, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.140625, + "epoch": 4.101180438448567, + "grad_norm": 8.035275994452993, + "kl": 0.50390625, + "learning_rate": 1.811129848229342e-07, + "loss": 0.0005, + "reward": 3.208521842956543, + "reward_std": 0.09160394221544266, + "rewards/final_reward": 1.791497057160064, + "rewards/mask_iou_reward": 0.895748528580032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.208521842956543, + "rewards/thk_ans_format_reward": 1.0, + "step": 2428, + "think_completion_length": 41.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.28125, + "epoch": 4.102866779089376, + "grad_norm": 7.516752843962914, + "kl": 0.541015625, + "learning_rate": 1.8077571669477235e-07, + "loss": 0.0005, + "reward": 3.7405242919921875, + "reward_std": 0.03224869258701801, + "rewards/final_reward": 1.962668194004178, + "rewards/mask_iou_reward": 0.981334097002089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7405242919921875, + "rewards/thk_ans_format_reward": 1.0, + "step": 2429, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.1875, + "epoch": 4.104553119730186, + "grad_norm": 44.142514705287155, + "kl": 0.81640625, + "learning_rate": 1.8043844856661044e-07, + "loss": 0.0008, + "reward": 3.570330858230591, + "reward_std": 0.18656124360859394, + "rewards/final_reward": 1.5419715342322102, + "rewards/mask_iou_reward": 0.7709857671161051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.570330560207367, + "rewards/thk_ans_format_reward": 1.0, + "step": 2430, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5625, + "epoch": 4.106239460370995, + "grad_norm": 12.125870111288556, + "kl": 0.568359375, + "learning_rate": 1.8010118043844855e-07, + "loss": 0.0006, + "reward": 3.7397871017456055, + "reward_std": 0.06458837911486626, + "rewards/final_reward": 1.7609626170576957, + "rewards/mask_iou_reward": 0.8804813085288479, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.739786982536316, + "rewards/thk_ans_format_reward": 1.0, + "step": 2431, + "think_completion_length": 36.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.71875, + "epoch": 4.107925801011804, + "grad_norm": 9.271025189840522, + "kl": 0.59765625, + "learning_rate": 1.7976391231028666e-07, + "loss": 0.0006, + "reward": 3.296180844306946, + "reward_std": 0.10032219812273979, + "rewards/final_reward": 0.969565839219187, + "rewards/mask_iou_reward": 0.4847829196095935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.296180784702301, + "rewards/thk_ans_format_reward": 1.0, + "step": 2432, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.671875, + "epoch": 4.1096121416526135, + "grad_norm": 22.015537297219275, + "kl": 0.560546875, + "learning_rate": 1.7942664418212478e-07, + "loss": 0.0006, + "reward": 3.7425882816314697, + "reward_std": 0.11254134774208069, + "rewards/final_reward": 1.6666927241671812, + "rewards/mask_iou_reward": 0.8333463620835906, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7425881624221802, + "rewards/thk_ans_format_reward": 1.0, + "step": 2433, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.46875, + "epoch": 4.1112984822934235, + "grad_norm": 10.396776683202178, + "kl": 0.54296875, + "learning_rate": 1.790893760539629e-07, + "loss": 0.0006, + "reward": 3.3835560083389282, + "reward_std": 0.17423714324831963, + "rewards/final_reward": 1.2915921295781196, + "rewards/mask_iou_reward": 0.6457960647890598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3835560083389282, + "rewards/thk_ans_format_reward": 1.0, + "step": 2434, + "think_completion_length": 41.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 4.112984822934233, + "grad_norm": 7.691601405960013, + "kl": 0.623046875, + "learning_rate": 1.78752107925801e-07, + "loss": 0.0006, + "reward": 3.6533315181732178, + "reward_std": 0.07725580548867583, + "rewards/final_reward": 1.5506914329713433, + "rewards/mask_iou_reward": 0.7753457164856716, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.653331458568573, + "rewards/thk_ans_format_reward": 1.0, + "step": 2435, + "think_completion_length": 47.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.859375, + "epoch": 4.114671163575042, + "grad_norm": 4.890209137735515, + "kl": 0.525390625, + "learning_rate": 1.784148397976391e-07, + "loss": 0.0005, + "reward": 2.8774254322052, + "reward_std": 0.21360297221690416, + "rewards/final_reward": 1.0372495292692403, + "rewards/mask_iou_reward": 0.5186247646346202, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8774254620075226, + "rewards/thk_ans_format_reward": 1.0, + "step": 2436, + "think_completion_length": 46.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.953125, + "epoch": 4.116357504215852, + "grad_norm": 6.788718135979743, + "kl": 0.5703125, + "learning_rate": 1.7807757166947723e-07, + "loss": 0.0006, + "reward": 3.4717376232147217, + "reward_std": 0.042540392372757196, + "rewards/final_reward": 1.7633789684762051, + "rewards/mask_iou_reward": 0.8816894842381026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4717376828193665, + "rewards/thk_ans_format_reward": 1.0, + "step": 2437, + "think_completion_length": 40.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.796875, + "epoch": 4.118043844856661, + "grad_norm": 15.210072320250731, + "kl": 0.646484375, + "learning_rate": 1.7774030354131535e-07, + "loss": 0.0006, + "reward": 3.538661479949951, + "reward_std": 0.12559128180146217, + "rewards/final_reward": 1.2970506437435008, + "rewards/mask_iou_reward": 0.6485253218717504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5386614799499512, + "rewards/thk_ans_format_reward": 1.0, + "step": 2438, + "think_completion_length": 44.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.296875, + "epoch": 4.11973018549747, + "grad_norm": 12.104083484914264, + "kl": 0.541015625, + "learning_rate": 1.7740303541315346e-07, + "loss": 0.0005, + "reward": 3.2933363914489746, + "reward_std": 0.0966414324939251, + "rewards/final_reward": 1.6069715814968468, + "rewards/mask_iou_reward": 0.8034857907484234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2933364510536194, + "rewards/thk_ans_format_reward": 1.0, + "step": 2439, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.515625, + "epoch": 4.12141652613828, + "grad_norm": 5.781855397089921, + "kl": 0.55078125, + "learning_rate": 1.7706576728499157e-07, + "loss": 0.0005, + "reward": 3.274218440055847, + "reward_std": 0.04520893655717373, + "rewards/final_reward": 1.4331149297308339, + "rewards/mask_iou_reward": 0.7165574648654169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.27421835064888, + "rewards/thk_ans_format_reward": 1.0, + "step": 2440, + "think_completion_length": 41.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.59375, + "epoch": 4.12310286677909, + "grad_norm": 8.760429862503628, + "kl": 0.5859375, + "learning_rate": 1.7672849915682966e-07, + "loss": 0.0006, + "reward": 3.4449740648269653, + "reward_std": 0.18864751234650612, + "rewards/final_reward": 1.3188366043049522, + "rewards/mask_iou_reward": 0.6594183021524761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4449740648269653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2441, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.515625, + "epoch": 4.124789207419899, + "grad_norm": 8.159581656181548, + "kl": 0.5859375, + "learning_rate": 1.763912310286678e-07, + "loss": 0.0006, + "reward": 3.2669503688812256, + "reward_std": 0.11719108745455742, + "rewards/final_reward": 1.5414467634716296, + "rewards/mask_iou_reward": 0.7707233817358148, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2825754284858704, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2442, + "think_completion_length": 38.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.375, + "epoch": 4.126475548060708, + "grad_norm": 6.441309063839323, + "kl": 0.580078125, + "learning_rate": 1.760539629005059e-07, + "loss": 0.0006, + "reward": 3.6620864868164062, + "reward_std": 0.04250127053819597, + "rewards/final_reward": 1.734774837092839, + "rewards/mask_iou_reward": 0.8673874185464195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6620864272117615, + "rewards/thk_ans_format_reward": 1.0, + "step": 2443, + "think_completion_length": 41.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.515625, + "epoch": 4.128161888701518, + "grad_norm": 5.310654249140433, + "kl": 0.55078125, + "learning_rate": 1.75716694772344e-07, + "loss": 0.0006, + "reward": 3.5324217081069946, + "reward_std": 0.028282339684665203, + "rewards/final_reward": 1.4161529472944359, + "rewards/mask_iou_reward": 0.7080764736472179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5324216485023499, + "rewards/thk_ans_format_reward": 1.0, + "step": 2444, + "think_completion_length": 40.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.390625, + "epoch": 4.129848229342327, + "grad_norm": 14.968375346591747, + "kl": 0.56640625, + "learning_rate": 1.7537942664418212e-07, + "loss": 0.0006, + "reward": 3.473724842071533, + "reward_std": 0.019786870572715998, + "rewards/final_reward": 1.9477039381400956, + "rewards/mask_iou_reward": 0.9738519690700478, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.473724901676178, + "rewards/thk_ans_format_reward": 1.0, + "step": 2445, + "think_completion_length": 45.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.28125, + "epoch": 4.1315345699831365, + "grad_norm": 11.14283779346565, + "kl": 0.58984375, + "learning_rate": 1.7504215851602023e-07, + "loss": 0.0006, + "reward": 3.430332660675049, + "reward_std": 0.17202283442020416, + "rewards/final_reward": 0.973097065771472, + "rewards/mask_iou_reward": 0.486548532885736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4303327202796936, + "rewards/thk_ans_format_reward": 1.0, + "step": 2446, + "think_completion_length": 38.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.90625, + "epoch": 4.133220910623946, + "grad_norm": 10.88582244855959, + "kl": 0.56640625, + "learning_rate": 1.7470489038785835e-07, + "loss": 0.0006, + "reward": 3.647831082344055, + "reward_std": 0.0973800290375948, + "rewards/final_reward": 1.5795811030044342, + "rewards/mask_iou_reward": 0.7897905515022171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6478310227394104, + "rewards/thk_ans_format_reward": 1.0, + "step": 2447, + "think_completion_length": 37.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.84375, + "epoch": 4.134907251264756, + "grad_norm": 6.072609038776836, + "kl": 0.595703125, + "learning_rate": 1.7436762225969646e-07, + "loss": 0.0006, + "reward": 3.5523641109466553, + "reward_std": 0.023783092387020588, + "rewards/final_reward": 1.6333426320749438, + "rewards/mask_iou_reward": 0.8166713160374719, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5523640513420105, + "rewards/thk_ans_format_reward": 1.0, + "step": 2448, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.765625, + "epoch": 4.136593591905565, + "grad_norm": 6.0999813540724555, + "kl": 0.52734375, + "learning_rate": 1.7403035413153455e-07, + "loss": 0.0006, + "reward": 3.1396783590316772, + "reward_std": 0.20230736583471298, + "rewards/final_reward": 1.4668547519450323, + "rewards/mask_iou_reward": 0.7334273759725162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1396782994270325, + "rewards/thk_ans_format_reward": 1.0, + "step": 2449, + "think_completion_length": 45.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.65625, + "epoch": 4.138279932546374, + "grad_norm": 37.05617431808596, + "kl": 0.5400390625, + "learning_rate": 1.7369308600337269e-07, + "loss": 0.0005, + "reward": 3.4970571994781494, + "reward_std": 0.011977422516793013, + "rewards/final_reward": 1.7754393479215091, + "rewards/mask_iou_reward": 0.8877196739607546, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4970571994781494, + "rewards/thk_ans_format_reward": 1.0, + "step": 2450, + "think_completion_length": 40.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.421875, + "epoch": 4.139966273187184, + "grad_norm": 19.32704554505315, + "kl": 0.59375, + "learning_rate": 1.7335581787521077e-07, + "loss": 0.0006, + "reward": 3.5168113708496094, + "reward_std": 0.04148505628108978, + "rewards/final_reward": 1.1838286003096194, + "rewards/mask_iou_reward": 0.5919143001548097, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5168115496635437, + "rewards/thk_ans_format_reward": 1.0, + "step": 2451, + "think_completion_length": 45.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.203125, + "epoch": 4.141652613827993, + "grad_norm": 10.432641541321457, + "kl": 0.58203125, + "learning_rate": 1.7301854974704891e-07, + "loss": 0.0006, + "reward": 3.374828338623047, + "reward_std": 0.0938992714509368, + "rewards/final_reward": 1.4010326022077564, + "rewards/mask_iou_reward": 0.7005163011038782, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.374828279018402, + "rewards/thk_ans_format_reward": 1.0, + "step": 2452, + "think_completion_length": 47.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.0, + "epoch": 4.143338954468803, + "grad_norm": 6.796553460953023, + "kl": 0.580078125, + "learning_rate": 1.72681281618887e-07, + "loss": 0.0006, + "reward": 3.440047025680542, + "reward_std": 0.07851972058415413, + "rewards/final_reward": 0.9987816165391428, + "rewards/mask_iou_reward": 0.4993908082695714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4400469660758972, + "rewards/thk_ans_format_reward": 1.0, + "step": 2453, + "think_completion_length": 43.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.84375, + "epoch": 4.145025295109612, + "grad_norm": 138.67722008154183, + "kl": 0.49609375, + "learning_rate": 1.7234401349072512e-07, + "loss": 0.0005, + "reward": 3.4035454988479614, + "reward_std": 0.1103200614452362, + "rewards/final_reward": 1.0903468919454502, + "rewards/mask_iou_reward": 0.5451734459727251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4035455584526062, + "rewards/thk_ans_format_reward": 1.0, + "step": 2454, + "think_completion_length": 39.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 4.146711635750422, + "grad_norm": 11.689257540526738, + "kl": 0.544921875, + "learning_rate": 1.7200674536256323e-07, + "loss": 0.0005, + "reward": 3.2984225749969482, + "reward_std": 0.030012394301593304, + "rewards/final_reward": 0.8408413380636273, + "rewards/mask_iou_reward": 0.42042066903181363, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2984225749969482, + "rewards/thk_ans_format_reward": 1.0, + "step": 2455, + "think_completion_length": 45.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.359375, + "epoch": 4.148397976391231, + "grad_norm": 11.278926743139742, + "kl": 0.53515625, + "learning_rate": 1.7166947723440134e-07, + "loss": 0.0005, + "reward": 3.174254894256592, + "reward_std": 0.15206933487206697, + "rewards/final_reward": 1.1872710606329515, + "rewards/mask_iou_reward": 0.5936355303164758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1742548644542694, + "rewards/thk_ans_format_reward": 1.0, + "step": 2456, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.078125, + "epoch": 4.15008431703204, + "grad_norm": 18.17030426571636, + "kl": 0.5625, + "learning_rate": 1.7133220910623943e-07, + "loss": 0.0006, + "reward": 3.169532537460327, + "reward_std": 0.39185091853141785, + "rewards/final_reward": 1.2581744958762393, + "rewards/mask_iou_reward": 0.6290872479381197, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.1851573884487152, + "rewards/thk_ans_format_reward": 1.0, + "step": 2457, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.375, + "epoch": 4.15177065767285, + "grad_norm": 8.148184638625576, + "kl": 0.572265625, + "learning_rate": 1.7099494097807757e-07, + "loss": 0.0006, + "reward": 3.594156265258789, + "reward_std": 0.1541702593676746, + "rewards/final_reward": 1.6085641885897508, + "rewards/mask_iou_reward": 0.8042820942948754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.594156265258789, + "rewards/thk_ans_format_reward": 1.0, + "step": 2458, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.796875, + "epoch": 4.1534569983136596, + "grad_norm": 68.33425571048502, + "kl": 0.58203125, + "learning_rate": 1.7065767284991566e-07, + "loss": 0.0006, + "reward": 3.1055225133895874, + "reward_std": 0.0070166842779144645, + "rewards/final_reward": 0.7701352773841368, + "rewards/mask_iou_reward": 0.3850676386920684, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1055223941802979, + "rewards/thk_ans_format_reward": 1.0, + "step": 2459, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.0, + "epoch": 4.155143338954469, + "grad_norm": 6.293257412705822, + "kl": 0.54296875, + "learning_rate": 1.703204047217538e-07, + "loss": 0.0005, + "reward": 3.3578845262527466, + "reward_std": 0.28066894970834255, + "rewards/final_reward": 1.5450707270999193, + "rewards/mask_iou_reward": 0.7725353635499597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3578845262527466, + "rewards/thk_ans_format_reward": 1.0, + "step": 2460, + "think_completion_length": 43.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.1875, + "epoch": 4.156829679595278, + "grad_norm": 58.09452291181341, + "kl": 0.568359375, + "learning_rate": 1.6998313659359189e-07, + "loss": 0.0006, + "reward": 3.3822141885757446, + "reward_std": 0.05487864976748824, + "rewards/final_reward": 1.1080669673818953, + "rewards/mask_iou_reward": 0.5540334836909476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3822141885757446, + "rewards/thk_ans_format_reward": 1.0, + "step": 2461, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.75, + "epoch": 4.158516020236088, + "grad_norm": 14.957547077861275, + "kl": 0.53515625, + "learning_rate": 1.6964586846543e-07, + "loss": 0.0005, + "reward": 3.4989073276519775, + "reward_std": 0.3742763102054596, + "rewards/final_reward": 1.8025648484496086, + "rewards/mask_iou_reward": 0.9012824242248043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4989073872566223, + "rewards/thk_ans_format_reward": 1.0, + "step": 2462, + "think_completion_length": 39.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.421875, + "epoch": 4.160202360876897, + "grad_norm": 10.672064968033249, + "kl": 0.552734375, + "learning_rate": 1.6930860033726811e-07, + "loss": 0.0006, + "reward": 3.3404901027679443, + "reward_std": 0.11594452522695065, + "rewards/final_reward": 1.3714671161693603, + "rewards/mask_iou_reward": 0.6857335580846802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3404901027679443, + "rewards/thk_ans_format_reward": 1.0, + "step": 2463, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.390625, + "epoch": 4.161888701517706, + "grad_norm": 18.616831487721143, + "kl": 0.599609375, + "learning_rate": 1.6897133220910623e-07, + "loss": 0.0006, + "reward": 2.7729495763778687, + "reward_std": 0.059840716421604156, + "rewards/final_reward": 0.555567492024673, + "rewards/mask_iou_reward": 0.2777837460123365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7729496173560619, + "rewards/thk_ans_format_reward": 1.0, + "step": 2464, + "think_completion_length": 41.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.265625, + "epoch": 4.163575042158516, + "grad_norm": 13.961223657906311, + "kl": 0.6015625, + "learning_rate": 1.6863406408094437e-07, + "loss": 0.0006, + "reward": 3.2185802459716797, + "reward_std": 0.23981109261512756, + "rewards/final_reward": 1.1825092552118748, + "rewards/mask_iou_reward": 0.5912546276059374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2185802459716797, + "rewards/thk_ans_format_reward": 1.0, + "step": 2465, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.9375, + "epoch": 4.165261382799326, + "grad_norm": 6.115827974735234, + "kl": 0.494140625, + "learning_rate": 1.6829679595278246e-07, + "loss": 0.0005, + "reward": 3.298157572746277, + "reward_std": 0.22440132359042764, + "rewards/final_reward": 1.529379808654268, + "rewards/mask_iou_reward": 0.764689904327134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2981576323509216, + "rewards/thk_ans_format_reward": 1.0, + "step": 2466, + "think_completion_length": 41.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.421875, + "epoch": 4.166947723440135, + "grad_norm": 7.534310280986116, + "kl": 0.65625, + "learning_rate": 1.6795952782462057e-07, + "loss": 0.0007, + "reward": 3.225925326347351, + "reward_std": 0.0442003165371716, + "rewards/final_reward": 1.2835473520049743, + "rewards/mask_iou_reward": 0.6417736760024871, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2259255051612854, + "rewards/thk_ans_format_reward": 1.0, + "step": 2467, + "think_completion_length": 40.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.34375, + "epoch": 4.168634064080944, + "grad_norm": 8.641948808655975, + "kl": 0.60546875, + "learning_rate": 1.6762225969645868e-07, + "loss": 0.0006, + "reward": 3.5798412561416626, + "reward_std": 0.09165813028812408, + "rewards/final_reward": 1.9735194661697966, + "rewards/mask_iou_reward": 0.9867597330848983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.579841136932373, + "rewards/thk_ans_format_reward": 1.0, + "step": 2468, + "think_completion_length": 43.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.078125, + "epoch": 4.170320404721754, + "grad_norm": 10.071162143296714, + "kl": 0.59765625, + "learning_rate": 1.672849915682968e-07, + "loss": 0.0006, + "reward": 3.6934038400650024, + "reward_std": 0.06623989366926253, + "rewards/final_reward": 1.7010132420361757, + "rewards/mask_iou_reward": 0.8505066210180878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6934038996696472, + "rewards/thk_ans_format_reward": 1.0, + "step": 2469, + "think_completion_length": 38.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.65625, + "epoch": 4.172006745362563, + "grad_norm": 7.700699025085925, + "kl": 0.5703125, + "learning_rate": 1.669477234401349e-07, + "loss": 0.0006, + "reward": 3.537416100502014, + "reward_std": 0.07881812565028667, + "rewards/final_reward": 1.569391147970117, + "rewards/mask_iou_reward": 0.7846955739850585, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5374162197113037, + "rewards/thk_ans_format_reward": 1.0, + "step": 2470, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.453125, + "epoch": 4.1736930860033725, + "grad_norm": 7.862675036034808, + "kl": 0.650390625, + "learning_rate": 1.6661045531197302e-07, + "loss": 0.0007, + "reward": 3.441859245300293, + "reward_std": 0.11183974612504244, + "rewards/final_reward": 1.3813556688252486, + "rewards/mask_iou_reward": 0.6906778344126243, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4418591260910034, + "rewards/thk_ans_format_reward": 1.0, + "step": 2471, + "think_completion_length": 40.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.671875, + "epoch": 4.175379426644182, + "grad_norm": 7.741602417773985, + "kl": 0.669921875, + "learning_rate": 1.662731871838111e-07, + "loss": 0.0007, + "reward": 3.3296186923980713, + "reward_std": 0.20109844952821732, + "rewards/final_reward": 1.199130153563452, + "rewards/mask_iou_reward": 0.599565076781726, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3296188116073608, + "rewards/thk_ans_format_reward": 1.0, + "step": 2472, + "think_completion_length": 42.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.3125, + "epoch": 4.177065767284992, + "grad_norm": 14.966449408412934, + "kl": 0.51953125, + "learning_rate": 1.6593591905564925e-07, + "loss": 0.0005, + "reward": 3.700683116912842, + "reward_std": 0.0946221414487809, + "rewards/final_reward": 1.5414615577057402, + "rewards/mask_iou_reward": 0.7707307788528701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7006831169128418, + "rewards/thk_ans_format_reward": 1.0, + "step": 2473, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.75, + "epoch": 4.178752107925801, + "grad_norm": 7.711478514273177, + "kl": 0.578125, + "learning_rate": 1.6559865092748734e-07, + "loss": 0.0006, + "reward": 3.4979605674743652, + "reward_std": 0.05067999288439751, + "rewards/final_reward": 1.0990489792559652, + "rewards/mask_iou_reward": 0.5495244896279826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4979605078697205, + "rewards/thk_ans_format_reward": 1.0, + "step": 2474, + "think_completion_length": 41.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.40625, + "epoch": 4.18043844856661, + "grad_norm": 13.284370242674193, + "kl": 0.576171875, + "learning_rate": 1.6526138279932545e-07, + "loss": 0.0006, + "reward": 3.54450786113739, + "reward_std": 0.06997823715209961, + "rewards/final_reward": 1.397211013913966, + "rewards/mask_iou_reward": 0.698605506956983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5445078611373901, + "rewards/thk_ans_format_reward": 1.0, + "step": 2475, + "think_completion_length": 41.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.71875, + "epoch": 4.18212478920742, + "grad_norm": 7.64312945876125, + "kl": 0.576171875, + "learning_rate": 1.6492411467116357e-07, + "loss": 0.0006, + "reward": 3.5403631925582886, + "reward_std": 0.08483145385980606, + "rewards/final_reward": 1.8252513400297394, + "rewards/mask_iou_reward": 0.9126256700148697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5403631925582886, + "rewards/thk_ans_format_reward": 1.0, + "step": 2476, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.3125, + "epoch": 4.1838111298482294, + "grad_norm": 7.755037495109282, + "kl": 0.7109375, + "learning_rate": 1.6458684654300168e-07, + "loss": 0.0006, + "reward": 3.5823196172714233, + "reward_std": 0.06578903924673796, + "rewards/final_reward": 1.3312047196853576, + "rewards/mask_iou_reward": 0.6656023598426788, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5823196768760681, + "rewards/thk_ans_format_reward": 1.0, + "step": 2477, + "think_completion_length": 44.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.890625, + "epoch": 4.185497470489039, + "grad_norm": 11.101933961115508, + "kl": 0.603515625, + "learning_rate": 1.642495784148398e-07, + "loss": 0.0006, + "reward": 3.231121063232422, + "reward_std": 0.08925764262676239, + "rewards/final_reward": 1.3585384057033247, + "rewards/mask_iou_reward": 0.6792692028516624, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2311209440231323, + "rewards/thk_ans_format_reward": 1.0, + "step": 2478, + "think_completion_length": 44.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.765625, + "epoch": 4.187183811129848, + "grad_norm": 12.23671466250251, + "kl": 0.576171875, + "learning_rate": 1.639123102866779e-07, + "loss": 0.0006, + "reward": 3.426143169403076, + "reward_std": 0.22152304695919156, + "rewards/final_reward": 1.0948263609425108, + "rewards/mask_iou_reward": 0.5474131804712554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4261431694030762, + "rewards/thk_ans_format_reward": 1.0, + "step": 2479, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0625, + "epoch": 4.188870151770658, + "grad_norm": 8.623741136224107, + "kl": 0.53125, + "learning_rate": 1.63575042158516e-07, + "loss": 0.0005, + "reward": 3.5016026496887207, + "reward_std": 0.13351611513644457, + "rewards/final_reward": 1.6067579257978652, + "rewards/mask_iou_reward": 0.8033789628989326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5016027092933655, + "rewards/thk_ans_format_reward": 1.0, + "step": 2480, + "think_completion_length": 43.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.625, + "epoch": 4.190556492411467, + "grad_norm": 11.972575662820923, + "kl": 0.595703125, + "learning_rate": 1.6323777403035414e-07, + "loss": 0.0006, + "reward": 3.379241704940796, + "reward_std": 0.04323430173099041, + "rewards/final_reward": 1.6247649723289799, + "rewards/mask_iou_reward": 0.8123824861644899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.379241704940796, + "rewards/thk_ans_format_reward": 1.0, + "step": 2481, + "think_completion_length": 39.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.96875, + "epoch": 4.192242833052276, + "grad_norm": 178.5208892805482, + "kl": 0.5625, + "learning_rate": 1.6290050590219222e-07, + "loss": 0.0006, + "reward": 3.587921619415283, + "reward_std": 0.15851197019219398, + "rewards/final_reward": 1.4456479511328377, + "rewards/mask_iou_reward": 0.7228239755664189, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5879215002059937, + "rewards/thk_ans_format_reward": 1.0, + "step": 2482, + "think_completion_length": 38.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.484375, + "epoch": 4.193929173693086, + "grad_norm": 28.989221718594397, + "kl": 3.2138671875, + "learning_rate": 1.6256323777403036e-07, + "loss": 0.0032, + "reward": 3.507957935333252, + "reward_std": 0.3725784122943878, + "rewards/final_reward": 1.6507667553906227, + "rewards/mask_iou_reward": 0.8253833776953113, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.5235828757286072, + "rewards/thk_ans_format_reward": 1.0, + "step": 2483, + "think_completion_length": 42.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.6875, + "epoch": 4.195615514333896, + "grad_norm": 12.249738021656217, + "kl": 0.54296875, + "learning_rate": 1.6222596964586845e-07, + "loss": 0.0005, + "reward": 3.28781259059906, + "reward_std": 0.047589752823114395, + "rewards/final_reward": 1.53295818019778, + "rewards/mask_iou_reward": 0.76647909009889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2878124713897705, + "rewards/thk_ans_format_reward": 1.0, + "step": 2484, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.34375, + "epoch": 4.197301854974705, + "grad_norm": 6.0851324746702335, + "kl": 0.560546875, + "learning_rate": 1.6188870151770657e-07, + "loss": 0.0006, + "reward": 3.3584847450256348, + "reward_std": 0.05267183552496135, + "rewards/final_reward": 1.2067760562381666, + "rewards/mask_iou_reward": 0.6033880281190833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3584846258163452, + "rewards/thk_ans_format_reward": 1.0, + "step": 2485, + "think_completion_length": 37.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.375, + "epoch": 4.198988195615514, + "grad_norm": 8.876805665524541, + "kl": 0.5859375, + "learning_rate": 1.6155143338954468e-07, + "loss": 0.0006, + "reward": 3.3225139379501343, + "reward_std": 0.19829332828521729, + "rewards/final_reward": 1.4276699296541213, + "rewards/mask_iou_reward": 0.7138349648270607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3225139379501343, + "rewards/thk_ans_format_reward": 1.0, + "step": 2486, + "think_completion_length": 43.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.53125, + "epoch": 4.200674536256324, + "grad_norm": 13.089534712596679, + "kl": 0.701171875, + "learning_rate": 1.612141652613828e-07, + "loss": 0.0007, + "reward": 3.582512617111206, + "reward_std": 0.08671862166374922, + "rewards/final_reward": 1.4499282316972055, + "rewards/mask_iou_reward": 0.7249641158486028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5825126767158508, + "rewards/thk_ans_format_reward": 1.0, + "step": 2487, + "think_completion_length": 40.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.859375, + "epoch": 4.202360876897133, + "grad_norm": 11.136842154555854, + "kl": 0.587890625, + "learning_rate": 1.6087689713322088e-07, + "loss": 0.0006, + "reward": 3.25620174407959, + "reward_std": 0.008030643686652184, + "rewards/final_reward": 1.237968267670258, + "rewards/mask_iou_reward": 0.618984133835129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.256201684474945, + "rewards/thk_ans_format_reward": 1.0, + "step": 2488, + "think_completion_length": 41.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.46875, + "epoch": 4.204047217537942, + "grad_norm": 4.78757444111789, + "kl": 0.58984375, + "learning_rate": 1.6053962900505902e-07, + "loss": 0.0006, + "reward": 3.3365434408187866, + "reward_std": 0.09441791824065149, + "rewards/final_reward": 0.8951354420755497, + "rewards/mask_iou_reward": 0.44756772103777487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3365434408187866, + "rewards/thk_ans_format_reward": 1.0, + "step": 2489, + "think_completion_length": 39.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5, + "epoch": 4.2057335581787525, + "grad_norm": 15.842252314728153, + "kl": 0.603515625, + "learning_rate": 1.602023608768971e-07, + "loss": 0.0006, + "reward": 3.6544800996780396, + "reward_std": 0.0645940825343132, + "rewards/final_reward": 1.71414012021267, + "rewards/mask_iou_reward": 0.857070060106335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6544801592826843, + "rewards/thk_ans_format_reward": 1.0, + "step": 2490, + "think_completion_length": 39.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5625, + "epoch": 4.207419898819562, + "grad_norm": 5.957565108696784, + "kl": 0.5341796875, + "learning_rate": 1.5986509274873525e-07, + "loss": 0.0005, + "reward": 3.7944085597991943, + "reward_std": 0.08072686195373535, + "rewards/final_reward": 1.7540967487275085, + "rewards/mask_iou_reward": 0.8770483743637543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7944085597991943, + "rewards/thk_ans_format_reward": 1.0, + "step": 2491, + "think_completion_length": 43.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.390625, + "epoch": 4.209106239460371, + "grad_norm": 17.524296887738412, + "kl": 0.6015625, + "learning_rate": 1.5952782462057334e-07, + "loss": 0.0005, + "reward": 3.2337619066238403, + "reward_std": 0.11936390213668346, + "rewards/final_reward": 1.1093958536501025, + "rewards/mask_iou_reward": 0.5546979268250513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2337620556354523, + "rewards/thk_ans_format_reward": 1.0, + "step": 2492, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.234375, + "epoch": 4.21079258010118, + "grad_norm": 26.279744573862683, + "kl": 0.587890625, + "learning_rate": 1.5919055649241145e-07, + "loss": 0.0006, + "reward": 3.144417881965637, + "reward_std": 0.1013356763869524, + "rewards/final_reward": 0.8731246791024757, + "rewards/mask_iou_reward": 0.43656233955123785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1444178223609924, + "rewards/thk_ans_format_reward": 1.0, + "step": 2493, + "think_completion_length": 46.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.015625, + "epoch": 4.21247892074199, + "grad_norm": 6.877698788213373, + "kl": 0.638671875, + "learning_rate": 1.5885328836424956e-07, + "loss": 0.0006, + "reward": 3.605825662612915, + "reward_std": 0.023511327803134918, + "rewards/final_reward": 1.4181790648223518, + "rewards/mask_iou_reward": 0.7090895324111759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6058255434036255, + "rewards/thk_ans_format_reward": 1.0, + "step": 2494, + "think_completion_length": 42.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.078125, + "epoch": 4.214165261382799, + "grad_norm": 5.386440086217419, + "kl": 0.48046875, + "learning_rate": 1.5851602023608768e-07, + "loss": 0.0005, + "reward": 3.653342127799988, + "reward_std": 0.09512594901025295, + "rewards/final_reward": 1.6142510430444, + "rewards/mask_iou_reward": 0.8071255215222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.653342068195343, + "rewards/thk_ans_format_reward": 1.0, + "step": 2495, + "think_completion_length": 40.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.484375, + "epoch": 4.2158516020236085, + "grad_norm": 12.123805188988428, + "kl": 0.603515625, + "learning_rate": 1.5817875210792582e-07, + "loss": 0.0006, + "reward": 3.4331681728363037, + "reward_std": 0.1368698626756668, + "rewards/final_reward": 1.6940559228465024, + "rewards/mask_iou_reward": 0.8470279614232512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.433168113231659, + "rewards/thk_ans_format_reward": 1.0, + "step": 2496, + "think_completion_length": 45.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.90625, + "epoch": 4.217537942664419, + "grad_norm": 9.518964659803792, + "kl": 0.576171875, + "learning_rate": 1.578414839797639e-07, + "loss": 0.0006, + "reward": 3.535395383834839, + "reward_std": 0.05820541735738516, + "rewards/final_reward": 1.4662414891515558, + "rewards/mask_iou_reward": 0.7331207445757779, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5353953838348389, + "rewards/thk_ans_format_reward": 1.0, + "step": 2497, + "think_completion_length": 39.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.234375, + "epoch": 4.219224283305228, + "grad_norm": 7.8898539600019015, + "kl": 0.78125, + "learning_rate": 1.5750421585160202e-07, + "loss": 0.0008, + "reward": 3.6014208793640137, + "reward_std": 0.017208684235811234, + "rewards/final_reward": 1.6433232711184682, + "rewards/mask_iou_reward": 0.8216616355592341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6014208793640137, + "rewards/thk_ans_format_reward": 1.0, + "step": 2498, + "think_completion_length": 44.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.25, + "epoch": 4.220910623946037, + "grad_norm": 5.996096425999949, + "kl": 0.5859375, + "learning_rate": 1.5716694772344013e-07, + "loss": 0.0006, + "reward": 3.2900307178497314, + "reward_std": 0.4222857290878892, + "rewards/final_reward": 0.8600683159762403, + "rewards/mask_iou_reward": 0.43003415798812017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2900307774543762, + "rewards/thk_ans_format_reward": 1.0, + "step": 2499, + "think_completion_length": 50.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.03125, + "epoch": 4.222596964586846, + "grad_norm": 17.096377692982486, + "kl": 0.59375, + "learning_rate": 1.5682967959527825e-07, + "loss": 0.0006, + "reward": 3.5150952339172363, + "reward_std": 0.08423554711043835, + "rewards/final_reward": 1.6300621768573955, + "rewards/mask_iou_reward": 0.8150310884286978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5150951743125916, + "rewards/thk_ans_format_reward": 1.0, + "step": 2500, + "think_completion_length": 42.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.390625, + "epoch": 4.224283305227656, + "grad_norm": 12.643943836787773, + "kl": 0.583984375, + "learning_rate": 1.5649241146711636e-07, + "loss": 0.0006, + "reward": 3.491933822631836, + "reward_std": 0.1002687681466341, + "rewards/final_reward": 1.338027084975392, + "rewards/mask_iou_reward": 0.669013542487696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.491933822631836, + "rewards/thk_ans_format_reward": 1.0, + "step": 2501, + "think_completion_length": 40.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.0, + "epoch": 4.2259696458684655, + "grad_norm": 5.584911428828709, + "kl": 0.5546875, + "learning_rate": 1.5615514333895447e-07, + "loss": 0.0005, + "reward": 3.7746880054473877, + "reward_std": 0.10910388245247304, + "rewards/final_reward": 1.7356168712740996, + "rewards/mask_iou_reward": 0.8678084356370498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7746880054473877, + "rewards/thk_ans_format_reward": 1.0, + "step": 2502, + "think_completion_length": 39.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.5, + "epoch": 4.227655986509275, + "grad_norm": 8.64637878813344, + "kl": 0.640625, + "learning_rate": 1.5581787521079256e-07, + "loss": 0.0006, + "reward": 3.6966729164123535, + "reward_std": 0.04432039085077122, + "rewards/final_reward": 1.5541293493811987, + "rewards/mask_iou_reward": 0.7770646746905994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6966729164123535, + "rewards/thk_ans_format_reward": 1.0, + "step": 2503, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.390625, + "epoch": 4.229342327150085, + "grad_norm": 11.840944781081609, + "kl": 0.61328125, + "learning_rate": 1.554806070826307e-07, + "loss": 0.0006, + "reward": 3.6088430881500244, + "reward_std": 0.04924646159633994, + "rewards/final_reward": 1.4039237413250574, + "rewards/mask_iou_reward": 0.7019618706625287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.608843207359314, + "rewards/thk_ans_format_reward": 1.0, + "step": 2504, + "think_completion_length": 47.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.875, + "epoch": 4.231028667790894, + "grad_norm": 6.119799545502641, + "kl": 0.69921875, + "learning_rate": 1.551433389544688e-07, + "loss": 0.0007, + "reward": 3.3514033555984497, + "reward_std": 0.1196487583220005, + "rewards/final_reward": 1.2155702096087642, + "rewards/mask_iou_reward": 0.6077851048043821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3514032363891602, + "rewards/thk_ans_format_reward": 1.0, + "step": 2505, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.59375, + "epoch": 4.232715008431703, + "grad_norm": 76.93348122363103, + "kl": 0.568359375, + "learning_rate": 1.548060708263069e-07, + "loss": 0.0006, + "reward": 3.7489959001541138, + "reward_std": 0.15297742490656674, + "rewards/final_reward": 1.7989957097859706, + "rewards/mask_iou_reward": 0.8994978548929853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7489957809448242, + "rewards/thk_ans_format_reward": 1.0, + "step": 2506, + "think_completion_length": 41.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.09375, + "epoch": 4.234401349072512, + "grad_norm": 10.290057729479736, + "kl": 0.548828125, + "learning_rate": 1.5446880269814502e-07, + "loss": 0.0006, + "reward": 3.3262614011764526, + "reward_std": 0.11492184177041054, + "rewards/final_reward": 1.1012222042789321, + "rewards/mask_iou_reward": 0.5506111021394661, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3262612223625183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2507, + "think_completion_length": 45.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.359375, + "epoch": 4.236087689713322, + "grad_norm": 5.491342812996931, + "kl": 0.482421875, + "learning_rate": 1.5413153456998313e-07, + "loss": 0.0005, + "reward": 3.218244433403015, + "reward_std": 0.14265291579067707, + "rewards/final_reward": 0.9250615596363921, + "rewards/mask_iou_reward": 0.46253077981819607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2182443737983704, + "rewards/thk_ans_format_reward": 1.0, + "step": 2508, + "think_completion_length": 41.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.578125, + "epoch": 4.237774030354132, + "grad_norm": 8.256158244019627, + "kl": 0.560546875, + "learning_rate": 1.5379426644182125e-07, + "loss": 0.0006, + "reward": 3.20368230342865, + "reward_std": 0.033036405220627785, + "rewards/final_reward": 0.9446484949510479, + "rewards/mask_iou_reward": 0.47232424747552393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2036822140216827, + "rewards/thk_ans_format_reward": 1.0, + "step": 2509, + "think_completion_length": 43.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.578125, + "epoch": 4.239460370994941, + "grad_norm": 10.240281506736242, + "kl": 0.580078125, + "learning_rate": 1.5345699831365936e-07, + "loss": 0.0006, + "reward": 3.5657538175582886, + "reward_std": 0.4374186247587204, + "rewards/final_reward": 1.715664315244335, + "rewards/mask_iou_reward": 0.8578321576221675, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5657540559768677, + "rewards/thk_ans_format_reward": 1.0, + "step": 2510, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.765625, + "epoch": 4.24114671163575, + "grad_norm": 18.110017710636633, + "kl": 0.607421875, + "learning_rate": 1.5311973018549745e-07, + "loss": 0.0006, + "reward": 3.4502862691879272, + "reward_std": 0.2845083028078079, + "rewards/final_reward": 1.2480094396741797, + "rewards/mask_iou_reward": 0.6240047198370898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4502861499786377, + "rewards/thk_ans_format_reward": 1.0, + "step": 2511, + "think_completion_length": 46.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.578125, + "epoch": 4.24283305227656, + "grad_norm": 13.761643496398287, + "kl": 0.53515625, + "learning_rate": 1.527824620573356e-07, + "loss": 0.0006, + "reward": 3.0231703519821167, + "reward_std": 0.06616012193262577, + "rewards/final_reward": 1.2360284179623469, + "rewards/mask_iou_reward": 0.6180142089811734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0231703221797943, + "rewards/thk_ans_format_reward": 1.0, + "step": 2512, + "think_completion_length": 40.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.875, + "epoch": 4.244519392917369, + "grad_norm": 16.104599863716597, + "kl": 0.52734375, + "learning_rate": 1.5244519392917367e-07, + "loss": 0.0005, + "reward": 3.3228198289871216, + "reward_std": 0.13831503875553608, + "rewards/final_reward": 0.949170678148781, + "rewards/mask_iou_reward": 0.4745853390743905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3228199481964111, + "rewards/thk_ans_format_reward": 1.0, + "step": 2513, + "think_completion_length": 47.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.03125, + "epoch": 4.246205733558178, + "grad_norm": 6.788400625001057, + "kl": 0.5, + "learning_rate": 1.5210792580101181e-07, + "loss": 0.0005, + "reward": 3.593444347381592, + "reward_std": 0.060188669711351395, + "rewards/final_reward": 1.4872439563748985, + "rewards/mask_iou_reward": 0.7436219781874492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5934444069862366, + "rewards/thk_ans_format_reward": 1.0, + "step": 2514, + "think_completion_length": 44.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.671875, + "epoch": 4.2478920741989885, + "grad_norm": 9.103261919282126, + "kl": 0.5859375, + "learning_rate": 1.517706576728499e-07, + "loss": 0.0006, + "reward": 3.8253079652786255, + "reward_std": 0.04512532241642475, + "rewards/final_reward": 1.76673815732204, + "rewards/mask_iou_reward": 0.88336907866102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8253080248832703, + "rewards/thk_ans_format_reward": 1.0, + "step": 2515, + "think_completion_length": 41.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.765625, + "epoch": 4.249578414839798, + "grad_norm": 9.591163151149308, + "kl": 0.4736328125, + "learning_rate": 1.5143338954468802e-07, + "loss": 0.0005, + "reward": 3.2388182878494263, + "reward_std": 0.13420674204826355, + "rewards/final_reward": 1.490955949030143, + "rewards/mask_iou_reward": 0.7454779745150715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.238818347454071, + "rewards/thk_ans_format_reward": 1.0, + "step": 2516, + "think_completion_length": 41.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.515625, + "epoch": 4.251264755480607, + "grad_norm": 6.9616532584685835, + "kl": 0.576171875, + "learning_rate": 1.5109612141652613e-07, + "loss": 0.0006, + "reward": 3.499723196029663, + "reward_std": 0.24417180567979813, + "rewards/final_reward": 1.3162671110819626, + "rewards/mask_iou_reward": 0.6581335555409813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4997231364250183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2517, + "think_completion_length": 40.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5625, + "epoch": 4.252951096121416, + "grad_norm": 7.3075343624463684, + "kl": 0.603515625, + "learning_rate": 1.5075885328836424e-07, + "loss": 0.0006, + "reward": 3.616728186607361, + "reward_std": 0.013671865686774254, + "rewards/final_reward": 1.4692096036141498, + "rewards/mask_iou_reward": 0.7346048018070749, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6167281866073608, + "rewards/thk_ans_format_reward": 1.0, + "step": 2518, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.15625, + "epoch": 4.254637436762226, + "grad_norm": 9.351763371801017, + "kl": 0.546875, + "learning_rate": 1.5042158516020233e-07, + "loss": 0.0005, + "reward": 3.1668334007263184, + "reward_std": 0.02611308917403221, + "rewards/final_reward": 1.299961387052443, + "rewards/mask_iou_reward": 0.6499806935262215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1668334603309631, + "rewards/thk_ans_format_reward": 1.0, + "step": 2519, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.09375, + "epoch": 4.256323777403035, + "grad_norm": 22.596767977071785, + "kl": 0.541015625, + "learning_rate": 1.5008431703204047e-07, + "loss": 0.0005, + "reward": 3.0126872062683105, + "reward_std": 0.11716877296566963, + "rewards/final_reward": 1.1052302769538236, + "rewards/mask_iou_reward": 0.5526151384769118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0126870572566986, + "rewards/thk_ans_format_reward": 1.0, + "step": 2520, + "think_completion_length": 41.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.84375, + "epoch": 4.2580101180438445, + "grad_norm": 38.95245679064524, + "kl": 0.57421875, + "learning_rate": 1.4974704890387856e-07, + "loss": 0.0006, + "reward": 3.564436197280884, + "reward_std": 0.24005300551652908, + "rewards/final_reward": 1.551153769610408, + "rewards/mask_iou_reward": 0.775576884805204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5644361972808838, + "rewards/thk_ans_format_reward": 1.0, + "step": 2521, + "think_completion_length": 42.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.953125, + "epoch": 4.259696458684655, + "grad_norm": 12.630445249364401, + "kl": 0.53515625, + "learning_rate": 1.494097807757167e-07, + "loss": 0.0005, + "reward": 3.7122442722320557, + "reward_std": 0.03813726641237736, + "rewards/final_reward": 1.8947568158228092, + "rewards/mask_iou_reward": 0.9473784079114046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7122442722320557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2522, + "think_completion_length": 48.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.375, + "epoch": 4.261382799325464, + "grad_norm": 7.167206354353222, + "kl": 0.61328125, + "learning_rate": 1.4907251264755479e-07, + "loss": 0.0006, + "reward": 3.8314647674560547, + "reward_std": 0.010048975702375174, + "rewards/final_reward": 1.8508296433772948, + "rewards/mask_iou_reward": 0.9254148216886474, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.83146470785141, + "rewards/thk_ans_format_reward": 1.0, + "step": 2523, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.359375, + "epoch": 4.263069139966273, + "grad_norm": 9.268448682232227, + "kl": 0.61328125, + "learning_rate": 1.487352445193929e-07, + "loss": 0.0006, + "reward": 3.4042723178863525, + "reward_std": 0.22066151723265648, + "rewards/final_reward": 1.3187135732367026, + "rewards/mask_iou_reward": 0.6593567866183513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4042723178863525, + "rewards/thk_ans_format_reward": 1.0, + "step": 2524, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.578125, + "epoch": 4.264755480607082, + "grad_norm": 9.88039804584237, + "kl": 0.556640625, + "learning_rate": 1.4839797639123104e-07, + "loss": 0.0006, + "reward": 3.5861204862594604, + "reward_std": 0.09150342643260956, + "rewards/final_reward": 1.6389049707249272, + "rewards/mask_iou_reward": 0.8194524853624636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.586120367050171, + "rewards/thk_ans_format_reward": 1.0, + "step": 2525, + "think_completion_length": 38.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.921875, + "epoch": 4.266441821247892, + "grad_norm": 15.943676525441946, + "kl": 0.59765625, + "learning_rate": 1.4806070826306913e-07, + "loss": 0.0006, + "reward": 3.186599612236023, + "reward_std": 0.08903985074721277, + "rewards/final_reward": 1.1325104068487184, + "rewards/mask_iou_reward": 0.5662552034243592, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1865995526313782, + "rewards/thk_ans_format_reward": 1.0, + "step": 2526, + "think_completion_length": 45.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.25, + "epoch": 4.2681281618887015, + "grad_norm": 12.867814490080903, + "kl": 0.5390625, + "learning_rate": 1.4772344013490727e-07, + "loss": 0.0005, + "reward": 2.867884874343872, + "reward_std": 0.09711506590247154, + "rewards/final_reward": 0.8848869288997784, + "rewards/mask_iou_reward": 0.4424434644498892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8678848743438721, + "rewards/thk_ans_format_reward": 1.0, + "step": 2527, + "think_completion_length": 41.53125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.9375, + "epoch": 4.269814502529511, + "grad_norm": 18.68930223402884, + "kl": 0.59765625, + "learning_rate": 1.4738617200674536e-07, + "loss": 0.0006, + "reward": 3.5304031372070312, + "reward_std": 0.1704651303589344, + "rewards/final_reward": 1.887731740474944, + "rewards/mask_iou_reward": 0.943865870237472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5304031372070312, + "rewards/thk_ans_format_reward": 1.0, + "step": 2528, + "think_completion_length": 43.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.59375, + "epoch": 4.271500843170321, + "grad_norm": 7.147821963050068, + "kl": 0.73046875, + "learning_rate": 1.4704890387858347e-07, + "loss": 0.0007, + "reward": 3.2026796340942383, + "reward_std": 0.2595672160387039, + "rewards/final_reward": 1.6295541597620495, + "rewards/mask_iou_reward": 0.8147770798810248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.202679693698883, + "rewards/thk_ans_format_reward": 1.0, + "step": 2529, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.109375, + "epoch": 4.27318718381113, + "grad_norm": 8.772915963221111, + "kl": 0.65625, + "learning_rate": 1.4671163575042158e-07, + "loss": 0.0007, + "reward": 3.244017481803894, + "reward_std": 0.1081763282418251, + "rewards/final_reward": 1.2652142418705377, + "rewards/mask_iou_reward": 0.6326071209352688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2440175414085388, + "rewards/thk_ans_format_reward": 1.0, + "step": 2530, + "think_completion_length": 43.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.921875, + "epoch": 4.274873524451939, + "grad_norm": 14.419830166789696, + "kl": 0.599609375, + "learning_rate": 1.463743676222597e-07, + "loss": 0.0006, + "reward": 2.8510278463363647, + "reward_std": 0.06465473957359791, + "rewards/final_reward": 0.6454403339958547, + "rewards/mask_iou_reward": 0.3227201669979273, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8510278761386871, + "rewards/thk_ans_format_reward": 1.0, + "step": 2531, + "think_completion_length": 41.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.8125, + "epoch": 4.276559865092748, + "grad_norm": 30.39215143435121, + "kl": 0.60546875, + "learning_rate": 1.4603709949409778e-07, + "loss": 0.0006, + "reward": 3.8287590742111206, + "reward_std": 0.10854035732336342, + "rewards/final_reward": 1.8429312055969538, + "rewards/mask_iou_reward": 0.9214656027984769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8287590742111206, + "rewards/thk_ans_format_reward": 1.0, + "step": 2532, + "think_completion_length": 40.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.125, + "epoch": 4.278246205733558, + "grad_norm": 6.361875856345234, + "kl": 0.5078125, + "learning_rate": 1.4569983136593593e-07, + "loss": 0.0005, + "reward": 3.4332364797592163, + "reward_std": 0.13735715672373772, + "rewards/final_reward": 0.897129527052885, + "rewards/mask_iou_reward": 0.4485647635264425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.433236539363861, + "rewards/thk_ans_format_reward": 1.0, + "step": 2533, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.78125, + "epoch": 4.279932546374368, + "grad_norm": 7.905426948466219, + "kl": 0.533203125, + "learning_rate": 1.45362563237774e-07, + "loss": 0.0005, + "reward": 3.7020827531814575, + "reward_std": 0.2393525391817093, + "rewards/final_reward": 1.5801495385519464, + "rewards/mask_iou_reward": 0.7900747692759732, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7020828127861023, + "rewards/thk_ans_format_reward": 1.0, + "step": 2534, + "think_completion_length": 41.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.359375, + "epoch": 4.281618887015177, + "grad_norm": 21.16779988755827, + "kl": 0.58984375, + "learning_rate": 1.4502529510961215e-07, + "loss": 0.0006, + "reward": 3.296600341796875, + "reward_std": 0.11001870781183243, + "rewards/final_reward": 0.994581774113384, + "rewards/mask_iou_reward": 0.497290887056692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.296600341796875, + "rewards/thk_ans_format_reward": 1.0, + "step": 2535, + "think_completion_length": 42.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.15625, + "epoch": 4.283305227655987, + "grad_norm": 34.200597672922946, + "kl": 0.525390625, + "learning_rate": 1.4468802698145024e-07, + "loss": 0.0006, + "reward": 3.4385606050491333, + "reward_std": 0.1196369118988514, + "rewards/final_reward": 1.5373945337333406, + "rewards/mask_iou_reward": 0.7686972668666703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4385607242584229, + "rewards/thk_ans_format_reward": 1.0, + "step": 2536, + "think_completion_length": 39.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.8125, + "epoch": 4.284991568296796, + "grad_norm": 8.527478808694564, + "kl": 0.5546875, + "learning_rate": 1.4435075885328835e-07, + "loss": 0.0006, + "reward": 3.402387022972107, + "reward_std": 0.13465053914114833, + "rewards/final_reward": 1.165708602414731, + "rewards/mask_iou_reward": 0.5828543012073655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4023869633674622, + "rewards/thk_ans_format_reward": 1.0, + "step": 2537, + "think_completion_length": 38.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.140625, + "epoch": 4.286677908937605, + "grad_norm": 7.455663700142302, + "kl": 0.5859375, + "learning_rate": 1.4401349072512647e-07, + "loss": 0.0006, + "reward": 3.49160373210907, + "reward_std": 0.08658642042428255, + "rewards/final_reward": 1.4236609122565655, + "rewards/mask_iou_reward": 0.7118304561282828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4916037321090698, + "rewards/thk_ans_format_reward": 1.0, + "step": 2538, + "think_completion_length": 42.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.703125, + "epoch": 4.2883642495784144, + "grad_norm": 7.536923008415111, + "kl": 0.59765625, + "learning_rate": 1.4367622259696458e-07, + "loss": 0.0006, + "reward": 3.7058597803115845, + "reward_std": 0.0078626349568367, + "rewards/final_reward": 1.8276194686022689, + "rewards/mask_iou_reward": 0.9138097343011344, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7058596014976501, + "rewards/thk_ans_format_reward": 1.0, + "step": 2539, + "think_completion_length": 41.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.484375, + "epoch": 4.2900505902192245, + "grad_norm": 7.481946672048057, + "kl": 0.568359375, + "learning_rate": 1.433389544688027e-07, + "loss": 0.0006, + "reward": 3.454736828804016, + "reward_std": 0.10277345031499863, + "rewards/final_reward": 1.76114098749309, + "rewards/mask_iou_reward": 0.880570493746545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4547368288040161, + "rewards/thk_ans_format_reward": 1.0, + "step": 2540, + "think_completion_length": 46.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.3125, + "epoch": 4.291736930860034, + "grad_norm": 23.34801402220961, + "kl": 0.470703125, + "learning_rate": 1.430016863406408e-07, + "loss": 0.0005, + "reward": 3.748860239982605, + "reward_std": 0.05715477233752608, + "rewards/final_reward": 1.6972026995457257, + "rewards/mask_iou_reward": 0.8486013497728628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.748860478401184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2541, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.703125, + "epoch": 4.293423271500843, + "grad_norm": 7.4552507036217355, + "kl": 0.578125, + "learning_rate": 1.426644182124789e-07, + "loss": 0.0005, + "reward": 3.2460508346557617, + "reward_std": 0.09356878604739904, + "rewards/final_reward": 1.2559207078245218, + "rewards/mask_iou_reward": 0.6279603539122609, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.246050864458084, + "rewards/thk_ans_format_reward": 1.0, + "step": 2542, + "think_completion_length": 37.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 4.295109612141653, + "grad_norm": 10.08757853605006, + "kl": 0.5546875, + "learning_rate": 1.4232715008431704e-07, + "loss": 0.0004, + "reward": 3.878480553627014, + "reward_std": 0.010050483266240917, + "rewards/final_reward": 1.878368428642306, + "rewards/mask_iou_reward": 0.939184214321153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8784804940223694, + "rewards/thk_ans_format_reward": 1.0, + "step": 2543, + "think_completion_length": 46.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.890625, + "epoch": 4.296795952782462, + "grad_norm": 11.763135638839753, + "kl": 0.611328125, + "learning_rate": 1.4198988195615512e-07, + "loss": 0.0006, + "reward": 3.5589308738708496, + "reward_std": 0.05203424580395222, + "rewards/final_reward": 1.8302980244920706, + "rewards/mask_iou_reward": 0.9151490122460353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55893075466156, + "rewards/thk_ans_format_reward": 1.0, + "step": 2544, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.40625, + "epoch": 4.298482293423271, + "grad_norm": 24.036707132251024, + "kl": 0.509765625, + "learning_rate": 1.4165261382799326e-07, + "loss": 0.0005, + "reward": 3.25858473777771, + "reward_std": 0.04267970938235521, + "rewards/final_reward": 1.1185800974288043, + "rewards/mask_iou_reward": 0.5592900487144021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.258584588766098, + "rewards/thk_ans_format_reward": 1.0, + "step": 2545, + "think_completion_length": 42.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.640625, + "epoch": 4.300168634064081, + "grad_norm": 7.313306979254338, + "kl": 0.548828125, + "learning_rate": 1.4131534569983135e-07, + "loss": 0.0006, + "reward": 3.6531234979629517, + "reward_std": 0.046843864023685455, + "rewards/final_reward": 1.6615908392074423, + "rewards/mask_iou_reward": 0.8307954196037212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.653123378753662, + "rewards/thk_ans_format_reward": 1.0, + "step": 2546, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.984375, + "epoch": 4.301854974704891, + "grad_norm": 12.271146225861086, + "kl": 0.556640625, + "learning_rate": 1.4097807757166947e-07, + "loss": 0.0006, + "reward": 3.4084343910217285, + "reward_std": 0.17545827478170395, + "rewards/final_reward": 1.5821885705577525, + "rewards/mask_iou_reward": 0.7910942852788763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4084343910217285, + "rewards/thk_ans_format_reward": 1.0, + "step": 2547, + "think_completion_length": 40.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.078125, + "epoch": 4.3035413153457, + "grad_norm": 6.176229533488846, + "kl": 0.623046875, + "learning_rate": 1.4064080944350758e-07, + "loss": 0.0006, + "reward": 3.5700900554656982, + "reward_std": 0.14789751917123795, + "rewards/final_reward": 1.482314263419597, + "rewards/mask_iou_reward": 0.7411571317097985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.570090115070343, + "rewards/thk_ans_format_reward": 1.0, + "step": 2548, + "think_completion_length": 42.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.875, + "epoch": 4.305227655986509, + "grad_norm": 13.243385182075865, + "kl": 0.58984375, + "learning_rate": 1.403035413153457e-07, + "loss": 0.0006, + "reward": 3.7301249504089355, + "reward_std": 0.1095227412879467, + "rewards/final_reward": 1.5400399812358647, + "rewards/mask_iou_reward": 0.7700199906179324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7301249504089355, + "rewards/thk_ans_format_reward": 1.0, + "step": 2549, + "think_completion_length": 37.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0625, + "epoch": 4.306913996627319, + "grad_norm": 14.557143592798408, + "kl": 0.611328125, + "learning_rate": 1.3996627318718378e-07, + "loss": 0.0006, + "reward": 3.8407446146011353, + "reward_std": 0.014246857725083828, + "rewards/final_reward": 1.929611008155712, + "rewards/mask_iou_reward": 0.964805504077856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8407444953918457, + "rewards/thk_ans_format_reward": 1.0, + "step": 2550, + "think_completion_length": 39.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.21875, + "epoch": 4.308600337268128, + "grad_norm": 7.4757362832284615, + "kl": 0.5546875, + "learning_rate": 1.3962900505902192e-07, + "loss": 0.0006, + "reward": 3.779623031616211, + "reward_std": 0.0036925169406458735, + "rewards/final_reward": 1.8417629323380633, + "rewards/mask_iou_reward": 0.9208814661690317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7796229720115662, + "rewards/thk_ans_format_reward": 1.0, + "step": 2551, + "think_completion_length": 41.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.953125, + "epoch": 4.3102866779089375, + "grad_norm": 17.75758948815859, + "kl": 0.609375, + "learning_rate": 1.3929173693086e-07, + "loss": 0.0006, + "reward": 3.1587305068969727, + "reward_std": 0.024611515924334526, + "rewards/final_reward": 1.088152989525369, + "rewards/mask_iou_reward": 0.5440764947626845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1587304770946503, + "rewards/thk_ans_format_reward": 1.0, + "step": 2552, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.5625, + "epoch": 4.311973018549747, + "grad_norm": 9.284946231608782, + "kl": 0.4990234375, + "learning_rate": 1.3895446880269815e-07, + "loss": 0.0005, + "reward": 3.5739543437957764, + "reward_std": 0.07344475947320461, + "rewards/final_reward": 1.3897168341802106, + "rewards/mask_iou_reward": 0.6948584170901053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5739544034004211, + "rewards/thk_ans_format_reward": 1.0, + "step": 2553, + "think_completion_length": 43.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.6875, + "epoch": 4.313659359190557, + "grad_norm": 18.14816708538634, + "kl": 0.59375, + "learning_rate": 1.3861720067453624e-07, + "loss": 0.0007, + "reward": 3.52009117603302, + "reward_std": 0.15692077949643135, + "rewards/final_reward": 1.8011629865816787, + "rewards/mask_iou_reward": 0.9005814932908394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5200912952423096, + "rewards/thk_ans_format_reward": 1.0, + "step": 2554, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5, + "epoch": 4.315345699831366, + "grad_norm": 26.487807469250537, + "kl": 0.5859375, + "learning_rate": 1.3827993254637435e-07, + "loss": 0.0006, + "reward": 2.9428672790527344, + "reward_std": 0.07025075890123844, + "rewards/final_reward": 1.0398247604871105, + "rewards/mask_iou_reward": 0.5199123802435552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9428671598434448, + "rewards/thk_ans_format_reward": 1.0, + "step": 2555, + "think_completion_length": 36.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.21875, + "epoch": 4.317032040472175, + "grad_norm": 5.5807297955268576, + "kl": 0.458984375, + "learning_rate": 1.379426644182125e-07, + "loss": 0.0005, + "reward": 3.097283959388733, + "reward_std": 0.4455588236451149, + "rewards/final_reward": 0.8390536942603471, + "rewards/mask_iou_reward": 0.4195268471301736, + "rewards/sam_format_reward": 0.953125, + "rewards/sam_reward_func_ultra": 1.191033959388733, + "rewards/thk_ans_format_reward": 0.953125, + "step": 2556, + "think_completion_length": 47.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.890625, + "epoch": 4.318718381112985, + "grad_norm": 8.374934525891868, + "kl": 0.693359375, + "learning_rate": 1.3760539629005058e-07, + "loss": 0.0007, + "reward": 3.074215888977051, + "reward_std": 0.0957517126807943, + "rewards/final_reward": 1.4687460351048571, + "rewards/mask_iou_reward": 0.7343730175524286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0742157995700836, + "rewards/thk_ans_format_reward": 1.0, + "step": 2557, + "think_completion_length": 46.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.28125, + "epoch": 4.320404721753794, + "grad_norm": 6.11302937139873, + "kl": 0.5458984375, + "learning_rate": 1.3726812816188872e-07, + "loss": 0.0005, + "reward": 3.7231526374816895, + "reward_std": 0.07856985554099083, + "rewards/final_reward": 1.6635399623394522, + "rewards/mask_iou_reward": 0.8317699811697261, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7231525778770447, + "rewards/thk_ans_format_reward": 1.0, + "step": 2558, + "think_completion_length": 45.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.1875, + "epoch": 4.322091062394604, + "grad_norm": 25.653913441069957, + "kl": 0.80078125, + "learning_rate": 1.369308600337268e-07, + "loss": 0.0008, + "reward": 3.6056541204452515, + "reward_std": 0.0955022219568491, + "rewards/final_reward": 1.5772293602598686, + "rewards/mask_iou_reward": 0.7886146801299343, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6056540608406067, + "rewards/thk_ans_format_reward": 1.0, + "step": 2559, + "think_completion_length": 41.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.390625, + "epoch": 4.323777403035413, + "grad_norm": 13.998568715322113, + "kl": 0.578125, + "learning_rate": 1.3659359190556492e-07, + "loss": 0.0006, + "reward": 3.3204551935195923, + "reward_std": 0.06416707020252943, + "rewards/final_reward": 1.3612900585715115, + "rewards/mask_iou_reward": 0.6806450292857558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3204552829265594, + "rewards/thk_ans_format_reward": 1.0, + "step": 2560, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.171875, + "epoch": 4.325463743676223, + "grad_norm": 9.274193578613756, + "kl": 0.541015625, + "learning_rate": 1.3625632377740303e-07, + "loss": 0.0005, + "reward": 3.2127416133880615, + "reward_std": 0.17955580353736877, + "rewards/final_reward": 1.5627788016579989, + "rewards/mask_iou_reward": 0.7813894008289994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2283666729927063, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2561, + "think_completion_length": 43.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.796875, + "epoch": 4.327150084317032, + "grad_norm": 6.325127294286153, + "kl": 0.5390625, + "learning_rate": 1.3591905564924115e-07, + "loss": 0.0005, + "reward": 2.9596033096313477, + "reward_std": 0.14582211151719093, + "rewards/final_reward": 0.9671578490251985, + "rewards/mask_iou_reward": 0.48357892451259926, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9596033245325089, + "rewards/thk_ans_format_reward": 1.0, + "step": 2562, + "think_completion_length": 48.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.9375, + "epoch": 4.328836424957841, + "grad_norm": 7.20547111501158, + "kl": 0.619140625, + "learning_rate": 1.3558178752107923e-07, + "loss": 0.0007, + "reward": 3.8558907508850098, + "reward_std": 0.01761279860511422, + "rewards/final_reward": 1.7874402635111308, + "rewards/mask_iou_reward": 0.8937201317555654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8558905720710754, + "rewards/thk_ans_format_reward": 1.0, + "step": 2563, + "think_completion_length": 41.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.4375, + "epoch": 4.330522765598651, + "grad_norm": 12.15445085266186, + "kl": 0.564453125, + "learning_rate": 1.3524451939291738e-07, + "loss": 0.0006, + "reward": 3.4675991535186768, + "reward_std": 0.12800533324480057, + "rewards/final_reward": 1.8438290283183836, + "rewards/mask_iou_reward": 0.9219145141591918, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4675992727279663, + "rewards/thk_ans_format_reward": 1.0, + "step": 2564, + "think_completion_length": 44.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.625, + "epoch": 4.3322091062394605, + "grad_norm": 22.58463662805342, + "kl": 0.734375, + "learning_rate": 1.3490725126475546e-07, + "loss": 0.0006, + "reward": 3.4586970806121826, + "reward_std": 0.05200528213754296, + "rewards/final_reward": 0.9920415770419135, + "rewards/mask_iou_reward": 0.49602078852095677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.458696961402893, + "rewards/thk_ans_format_reward": 1.0, + "step": 2565, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.296875, + "epoch": 4.33389544688027, + "grad_norm": 8.36367289134029, + "kl": 0.5703125, + "learning_rate": 1.345699831365936e-07, + "loss": 0.0006, + "reward": 3.0413581132888794, + "reward_std": 0.2051805593073368, + "rewards/final_reward": 1.6693194437875627, + "rewards/mask_iou_reward": 0.8346597218937813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0413581728935242, + "rewards/thk_ans_format_reward": 1.0, + "step": 2566, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.609375, + "epoch": 4.335581787521079, + "grad_norm": 20.45512318736674, + "kl": 0.6328125, + "learning_rate": 1.342327150084317e-07, + "loss": 0.0006, + "reward": 3.7222063541412354, + "reward_std": 0.133345490321517, + "rewards/final_reward": 1.667593703850473, + "rewards/mask_iou_reward": 0.8337968519252364, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.722206175327301, + "rewards/thk_ans_format_reward": 1.0, + "step": 2567, + "think_completion_length": 40.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.09375, + "epoch": 4.337268128161889, + "grad_norm": 7.646885091994143, + "kl": 0.548828125, + "learning_rate": 1.338954468802698e-07, + "loss": 0.0005, + "reward": 3.547218680381775, + "reward_std": 0.12953244149684906, + "rewards/final_reward": 1.7914925262084225, + "rewards/mask_iou_reward": 0.8957462631042112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5628437995910645, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2568, + "think_completion_length": 38.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.8125, + "epoch": 4.338954468802698, + "grad_norm": 25.772815722112515, + "kl": 2.287109375, + "learning_rate": 1.3355817875210792e-07, + "loss": 0.0023, + "reward": 3.621209144592285, + "reward_std": 0.02371341548860073, + "rewards/final_reward": 1.2751909611401688, + "rewards/mask_iou_reward": 0.6375954805700844, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6212090253829956, + "rewards/thk_ans_format_reward": 1.0, + "step": 2569, + "think_completion_length": 42.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.5, + "epoch": 4.340640809443507, + "grad_norm": 7.315934620760691, + "kl": 0.63671875, + "learning_rate": 1.3322091062394603e-07, + "loss": 0.0006, + "reward": 3.53988254070282, + "reward_std": 0.07551046088337898, + "rewards/final_reward": 1.3149476657336536, + "rewards/mask_iou_reward": 0.6574738328668268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5398826003074646, + "rewards/thk_ans_format_reward": 1.0, + "step": 2570, + "think_completion_length": 42.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.890625, + "epoch": 4.3423271500843175, + "grad_norm": 14.984992365474874, + "kl": 0.669921875, + "learning_rate": 1.3288364249578415e-07, + "loss": 0.0007, + "reward": 3.1896384954452515, + "reward_std": 0.024790717288851738, + "rewards/final_reward": 1.6793651401459635, + "rewards/mask_iou_reward": 0.8396825700729817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1896384358406067, + "rewards/thk_ans_format_reward": 1.0, + "step": 2571, + "think_completion_length": 40.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5625, + "epoch": 4.344013490725127, + "grad_norm": 11.588965605464049, + "kl": 0.603515625, + "learning_rate": 1.3254637436762226e-07, + "loss": 0.0006, + "reward": 3.5344064235687256, + "reward_std": 0.08637862093746662, + "rewards/final_reward": 1.5520570082396912, + "rewards/mask_iou_reward": 0.7760285041198456, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5344064235687256, + "rewards/thk_ans_format_reward": 1.0, + "step": 2572, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.140625, + "epoch": 4.345699831365936, + "grad_norm": 22.296246109705102, + "kl": 0.623046875, + "learning_rate": 1.3220910623946035e-07, + "loss": 0.0006, + "reward": 3.57146418094635, + "reward_std": 0.010422832798212767, + "rewards/final_reward": 1.2223940986785407, + "rewards/mask_iou_reward": 0.6111970493392703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5714643001556396, + "rewards/thk_ans_format_reward": 1.0, + "step": 2573, + "think_completion_length": 43.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.203125, + "epoch": 4.347386172006745, + "grad_norm": 13.634815728942376, + "kl": 0.578125, + "learning_rate": 1.318718381112985e-07, + "loss": 0.0006, + "reward": 3.5449386835098267, + "reward_std": 0.05515991151332855, + "rewards/final_reward": 1.2598893903355834, + "rewards/mask_iou_reward": 0.6299446951677917, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.544938564300537, + "rewards/thk_ans_format_reward": 1.0, + "step": 2574, + "think_completion_length": 46.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.203125, + "epoch": 4.349072512647555, + "grad_norm": 8.606075795859018, + "kl": 0.60546875, + "learning_rate": 1.3153456998313657e-07, + "loss": 0.0007, + "reward": 3.432936906814575, + "reward_std": 0.14704424515366554, + "rewards/final_reward": 1.7833737091697128, + "rewards/mask_iou_reward": 0.8916868545848564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432936668395996, + "rewards/thk_ans_format_reward": 1.0, + "step": 2575, + "think_completion_length": 49.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.78125, + "epoch": 4.350758853288364, + "grad_norm": 14.03312107127187, + "kl": 0.572265625, + "learning_rate": 1.3119730185497472e-07, + "loss": 0.0006, + "reward": 3.3003830909729004, + "reward_std": 0.2684507966041565, + "rewards/final_reward": 1.754745717307658, + "rewards/mask_iou_reward": 0.877372858653829, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3003832697868347, + "rewards/thk_ans_format_reward": 1.0, + "step": 2576, + "think_completion_length": 44.71875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.921875, + "epoch": 4.3524451939291735, + "grad_norm": 9.563513685502976, + "kl": 0.599609375, + "learning_rate": 1.308600337268128e-07, + "loss": 0.0006, + "reward": 3.6351295709609985, + "reward_std": 0.033396379090845585, + "rewards/final_reward": 1.834490034594674, + "rewards/mask_iou_reward": 0.917245017297337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6351295113563538, + "rewards/thk_ans_format_reward": 1.0, + "step": 2577, + "think_completion_length": 43.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.96875, + "epoch": 4.354131534569984, + "grad_norm": 9.94042408918705, + "kl": 0.6171875, + "learning_rate": 1.3052276559865092e-07, + "loss": 0.0007, + "reward": 3.280154228210449, + "reward_std": 0.08011075738613727, + "rewards/final_reward": 1.7138860590023135, + "rewards/mask_iou_reward": 0.8569430295011567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2801542282104492, + "rewards/thk_ans_format_reward": 1.0, + "step": 2578, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.09375, + "epoch": 4.355817875210793, + "grad_norm": 15.375330400711096, + "kl": 0.5859375, + "learning_rate": 1.3018549747048903e-07, + "loss": 0.0006, + "reward": 3.6954853534698486, + "reward_std": 0.016146198846399784, + "rewards/final_reward": 1.574730574071296, + "rewards/mask_iou_reward": 0.787365287035648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.695485234260559, + "rewards/thk_ans_format_reward": 1.0, + "step": 2579, + "think_completion_length": 45.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.234375, + "epoch": 4.357504215851602, + "grad_norm": 14.048520592159923, + "kl": 0.59765625, + "learning_rate": 1.2984822934232714e-07, + "loss": 0.0006, + "reward": 2.707605004310608, + "reward_std": 0.04226122272666544, + "rewards/final_reward": 1.4152098919735496, + "rewards/mask_iou_reward": 0.7076049459867748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7076049447059631, + "rewards/thk_ans_format_reward": 1.0, + "step": 2580, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.015625, + "epoch": 4.359190556492411, + "grad_norm": 9.831350349061688, + "kl": 0.61328125, + "learning_rate": 1.2951096121416523e-07, + "loss": 0.0007, + "reward": 3.4770760536193848, + "reward_std": 0.1615507616661489, + "rewards/final_reward": 1.2932119503645565, + "rewards/mask_iou_reward": 0.6466059751822782, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.47707599401474, + "rewards/thk_ans_format_reward": 1.0, + "step": 2581, + "think_completion_length": 38.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.359375, + "epoch": 4.360876897133221, + "grad_norm": 11.50889866748189, + "kl": 0.591796875, + "learning_rate": 1.2917369308600337e-07, + "loss": 0.0006, + "reward": 3.314904808998108, + "reward_std": 0.06304793432354927, + "rewards/final_reward": 1.483831413195893, + "rewards/mask_iou_reward": 0.7419157065979465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3149048686027527, + "rewards/thk_ans_format_reward": 1.0, + "step": 2582, + "think_completion_length": 47.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.65625, + "epoch": 4.36256323777403, + "grad_norm": 7.46278891755736, + "kl": 0.5078125, + "learning_rate": 1.2883642495784146e-07, + "loss": 0.0005, + "reward": 3.532469868659973, + "reward_std": 0.1696995971724391, + "rewards/final_reward": 1.5694744475666043, + "rewards/mask_iou_reward": 0.7847372237833021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.532469630241394, + "rewards/thk_ans_format_reward": 1.0, + "step": 2583, + "think_completion_length": 52.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.96875, + "epoch": 4.36424957841484, + "grad_norm": 16.890277065484163, + "kl": 0.5859375, + "learning_rate": 1.284991568296796e-07, + "loss": 0.0006, + "reward": 3.673448085784912, + "reward_std": 0.03351970575749874, + "rewards/final_reward": 1.641488471538079, + "rewards/mask_iou_reward": 0.8207442357690395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6734481453895569, + "rewards/thk_ans_format_reward": 1.0, + "step": 2584, + "think_completion_length": 41.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.46875, + "epoch": 4.36593591905565, + "grad_norm": 21.628494381474855, + "kl": 0.64453125, + "learning_rate": 1.2816188870151771e-07, + "loss": 0.0006, + "reward": 3.265789270401001, + "reward_std": 0.0905698649585247, + "rewards/final_reward": 0.8923392086039716, + "rewards/mask_iou_reward": 0.4461696043019858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2657892405986786, + "rewards/thk_ans_format_reward": 1.0, + "step": 2585, + "think_completion_length": 44.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5625, + "epoch": 4.367622259696459, + "grad_norm": 10.898855924244831, + "kl": 0.607421875, + "learning_rate": 1.278246205733558e-07, + "loss": 0.0006, + "reward": 3.753677248954773, + "reward_std": 0.16097071999683976, + "rewards/final_reward": 1.9405172550981111, + "rewards/mask_iou_reward": 0.9702586275490556, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7536773085594177, + "rewards/thk_ans_format_reward": 1.0, + "step": 2586, + "think_completion_length": 40.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.84375, + "epoch": 4.369308600337268, + "grad_norm": 13.241927572902359, + "kl": 0.591796875, + "learning_rate": 1.2748735244519394e-07, + "loss": 0.0006, + "reward": 3.252161383628845, + "reward_std": 0.15102306054905057, + "rewards/final_reward": 1.6813825703169905, + "rewards/mask_iou_reward": 0.8406912851584952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2521612644195557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2587, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.03125, + "epoch": 4.370994940978077, + "grad_norm": 7.9241462399636635, + "kl": 0.5625, + "learning_rate": 1.2715008431703203e-07, + "loss": 0.0006, + "reward": 3.655425548553467, + "reward_std": 0.17257796972990036, + "rewards/final_reward": 1.7522866017704373, + "rewards/mask_iou_reward": 0.8761433008852186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.655425488948822, + "rewards/thk_ans_format_reward": 1.0, + "step": 2588, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.078125, + "epoch": 4.372681281618887, + "grad_norm": 11.05652046882899, + "kl": 0.603515625, + "learning_rate": 1.2681281618887017e-07, + "loss": 0.0006, + "reward": 3.748861074447632, + "reward_std": 0.09842104464769363, + "rewards/final_reward": 1.6676742514306229, + "rewards/mask_iou_reward": 0.8338371257153114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7488611340522766, + "rewards/thk_ans_format_reward": 1.0, + "step": 2589, + "think_completion_length": 45.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.796875, + "epoch": 4.3743676222596966, + "grad_norm": 16.919804718328106, + "kl": 0.564453125, + "learning_rate": 1.2647554806070826e-07, + "loss": 0.0006, + "reward": 3.2739087343215942, + "reward_std": 0.24324085749685764, + "rewards/final_reward": 1.0611457163072324, + "rewards/mask_iou_reward": 0.5305728581536162, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.3364086747169495, + "rewards/thk_ans_format_reward": 1.0, + "step": 2590, + "think_completion_length": 41.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.875, + "epoch": 4.376053962900506, + "grad_norm": 8.859520000216387, + "kl": 0.58203125, + "learning_rate": 1.2613827993254637e-07, + "loss": 0.0006, + "reward": 3.1826682090759277, + "reward_std": 0.11050739884376526, + "rewards/final_reward": 0.630540052892013, + "rewards/mask_iou_reward": 0.3152700264460065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1826683282852173, + "rewards/thk_ans_format_reward": 1.0, + "step": 2591, + "think_completion_length": 50.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.921875, + "epoch": 4.377740303541315, + "grad_norm": 43.66077845280807, + "kl": 0.59375, + "learning_rate": 1.2580101180438448e-07, + "loss": 0.0006, + "reward": 3.089726448059082, + "reward_std": 0.04616999439895153, + "rewards/final_reward": 0.9430638960374498, + "rewards/mask_iou_reward": 0.4715319480187249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.089726448059082, + "rewards/thk_ans_format_reward": 1.0, + "step": 2592, + "think_completion_length": 45.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.390625, + "epoch": 4.379426644182125, + "grad_norm": 111.18989282333878, + "kl": 0.56640625, + "learning_rate": 1.254637436762226e-07, + "loss": 0.0006, + "reward": 3.495315194129944, + "reward_std": 0.041504111140966415, + "rewards/final_reward": 1.15804746014589, + "rewards/mask_iou_reward": 0.579023730072945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.495315134525299, + "rewards/thk_ans_format_reward": 1.0, + "step": 2593, + "think_completion_length": 38.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.453125, + "epoch": 4.381112984822934, + "grad_norm": 11.511121528779169, + "kl": 0.5390625, + "learning_rate": 1.2512647554806069e-07, + "loss": 0.0005, + "reward": 3.4886449575424194, + "reward_std": 0.14284783974289894, + "rewards/final_reward": 1.4388147444293953, + "rewards/mask_iou_reward": 0.7194073722146976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4886450171470642, + "rewards/thk_ans_format_reward": 1.0, + "step": 2594, + "think_completion_length": 46.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.65625, + "epoch": 4.382799325463743, + "grad_norm": 6.5032283973923, + "kl": 0.560546875, + "learning_rate": 1.2478920741989883e-07, + "loss": 0.0006, + "reward": 3.439482092857361, + "reward_std": 0.03333376161754131, + "rewards/final_reward": 1.4244483775725096, + "rewards/mask_iou_reward": 0.7122241887862548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4394821524620056, + "rewards/thk_ans_format_reward": 1.0, + "step": 2595, + "think_completion_length": 45.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.296875, + "epoch": 4.3844856661045535, + "grad_norm": 7.433982934696911, + "kl": 0.623046875, + "learning_rate": 1.2445193929173694e-07, + "loss": 0.0006, + "reward": 2.9446091651916504, + "reward_std": 0.021507996134459972, + "rewards/final_reward": 0.7678388857192303, + "rewards/mask_iou_reward": 0.38391944285961516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9446091949939728, + "rewards/thk_ans_format_reward": 1.0, + "step": 2596, + "think_completion_length": 43.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5625, + "epoch": 4.386172006745363, + "grad_norm": 30.213404739416212, + "kl": 0.591796875, + "learning_rate": 1.2411467116357503e-07, + "loss": 0.0006, + "reward": 3.2194563150405884, + "reward_std": 0.12624008324928582, + "rewards/final_reward": 0.9775399201988229, + "rewards/mask_iou_reward": 0.4887699600994114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2194563150405884, + "rewards/thk_ans_format_reward": 1.0, + "step": 2597, + "think_completion_length": 37.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.265625, + "epoch": 4.387858347386172, + "grad_norm": 27.848311657695973, + "kl": 0.5859375, + "learning_rate": 1.2377740303541314e-07, + "loss": 0.0006, + "reward": 3.732755422592163, + "reward_std": 0.027668212191201746, + "rewards/final_reward": 1.6713844987182405, + "rewards/mask_iou_reward": 0.8356922493591202, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7327553629875183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2598, + "think_completion_length": 41.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 4.389544688026981, + "grad_norm": 8.426157508025339, + "kl": 0.6171875, + "learning_rate": 1.2344013490725125e-07, + "loss": 0.0006, + "reward": 2.883462429046631, + "reward_std": 0.027737990021705627, + "rewards/final_reward": 1.1196981285235068, + "rewards/mask_iou_reward": 0.5598490642617534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8834625482559204, + "rewards/thk_ans_format_reward": 1.0, + "step": 2599, + "think_completion_length": 44.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.1875, + "epoch": 4.391231028667791, + "grad_norm": 12.995200142244789, + "kl": 0.607421875, + "learning_rate": 1.2310286677908937e-07, + "loss": 0.0006, + "reward": 3.072115898132324, + "reward_std": 0.22114570438861847, + "rewards/final_reward": 0.7345745190405786, + "rewards/mask_iou_reward": 0.3672872595202893, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0721158385276794, + "rewards/thk_ans_format_reward": 1.0, + "step": 2600, + "think_completion_length": 50.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.890625, + "epoch": 4.3929173693086, + "grad_norm": 13.855587724723787, + "kl": 0.568359375, + "learning_rate": 1.2276559865092748e-07, + "loss": 0.0006, + "reward": 3.5711183547973633, + "reward_std": 0.05036383680999279, + "rewards/final_reward": 1.4760295484223664, + "rewards/mask_iou_reward": 0.7380147742111832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5711183547973633, + "rewards/thk_ans_format_reward": 1.0, + "step": 2601, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.546875, + "epoch": 4.3946037099494095, + "grad_norm": 5.755886009335213, + "kl": 0.548828125, + "learning_rate": 1.224283305227656e-07, + "loss": 0.0005, + "reward": 3.100886583328247, + "reward_std": 0.08746401220560074, + "rewards/final_reward": 1.187160076599616, + "rewards/mask_iou_reward": 0.593580038299808, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1008866727352142, + "rewards/thk_ans_format_reward": 1.0, + "step": 2602, + "think_completion_length": 40.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.578125, + "epoch": 4.39629005059022, + "grad_norm": 39.64395810103559, + "kl": 1.32421875, + "learning_rate": 1.220910623946037e-07, + "loss": 0.0013, + "reward": 3.728816032409668, + "reward_std": 0.06418004259467125, + "rewards/final_reward": 1.7604291987169278, + "rewards/mask_iou_reward": 0.8802145993584639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7288159728050232, + "rewards/thk_ans_format_reward": 1.0, + "step": 2603, + "think_completion_length": 42.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.734375, + "epoch": 4.397976391231029, + "grad_norm": 9.62940263705133, + "kl": 0.533203125, + "learning_rate": 1.2175379426644182e-07, + "loss": 0.0005, + "reward": 3.2988163232803345, + "reward_std": 0.12963218614459038, + "rewards/final_reward": 1.4517221758943717, + "rewards/mask_iou_reward": 0.7258610879471858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.298816204071045, + "rewards/thk_ans_format_reward": 1.0, + "step": 2604, + "think_completion_length": 40.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.296875, + "epoch": 4.399662731871838, + "grad_norm": 7.593131325449928, + "kl": 0.716796875, + "learning_rate": 1.214165261382799e-07, + "loss": 0.0007, + "reward": 3.7163710594177246, + "reward_std": 0.08841110952198505, + "rewards/final_reward": 1.6647490690087037, + "rewards/mask_iou_reward": 0.8323745345043518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7163708806037903, + "rewards/thk_ans_format_reward": 1.0, + "step": 2605, + "think_completion_length": 47.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.4375, + "epoch": 4.401349072512647, + "grad_norm": 5.2326008033600635, + "kl": 0.609375, + "learning_rate": 1.2107925801011802e-07, + "loss": 0.0006, + "reward": 2.621520757675171, + "reward_std": 0.13888922333717346, + "rewards/final_reward": 0.48397687984975585, + "rewards/mask_iou_reward": 0.24198843992487792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.621520608663559, + "rewards/thk_ans_format_reward": 1.0, + "step": 2606, + "think_completion_length": 39.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.6875, + "epoch": 4.403035413153457, + "grad_norm": 6.214061190619599, + "kl": 0.626953125, + "learning_rate": 1.2074198988195614e-07, + "loss": 0.0006, + "reward": 3.549259662628174, + "reward_std": 0.08032980561256409, + "rewards/final_reward": 1.4888046361899203, + "rewards/mask_iou_reward": 0.7444023180949602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5492598414421082, + "rewards/thk_ans_format_reward": 1.0, + "step": 2607, + "think_completion_length": 49.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.09375, + "epoch": 4.4047217537942664, + "grad_norm": 6.938380100370504, + "kl": 0.609375, + "learning_rate": 1.2040472175379425e-07, + "loss": 0.0006, + "reward": 3.187752604484558, + "reward_std": 0.05385738704353571, + "rewards/final_reward": 1.4650717276018062, + "rewards/mask_iou_reward": 0.7325358638009031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1877525746822357, + "rewards/thk_ans_format_reward": 1.0, + "step": 2608, + "think_completion_length": 36.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.34375, + "epoch": 4.406408094435076, + "grad_norm": 40.97108084569082, + "kl": 0.609375, + "learning_rate": 1.2006745362563237e-07, + "loss": 0.0006, + "reward": 3.8264540433883667, + "reward_std": 0.1715675238519907, + "rewards/final_reward": 1.9172816043021799, + "rewards/mask_iou_reward": 0.9586408021510899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.826453983783722, + "rewards/thk_ans_format_reward": 1.0, + "step": 2609, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.15625, + "epoch": 4.408094435075886, + "grad_norm": 8.004434533128602, + "kl": 0.548828125, + "learning_rate": 1.1973018549747048e-07, + "loss": 0.0005, + "reward": 2.7611573934555054, + "reward_std": 0.22699527442455292, + "rewards/final_reward": 0.8863231089658016, + "rewards/mask_iou_reward": 0.4431615544829008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7611575424671173, + "rewards/thk_ans_format_reward": 1.0, + "step": 2610, + "think_completion_length": 41.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.65625, + "epoch": 4.409780775716695, + "grad_norm": 5.520014277334112, + "kl": 0.56640625, + "learning_rate": 1.193929173693086e-07, + "loss": 0.0006, + "reward": 3.1015899181365967, + "reward_std": 0.11771095357835293, + "rewards/final_reward": 0.458460322402633, + "rewards/mask_iou_reward": 0.2292301612013165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.10158970952034, + "rewards/thk_ans_format_reward": 1.0, + "step": 2611, + "think_completion_length": 45.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.359375, + "epoch": 4.411467116357504, + "grad_norm": 9.160802093351666, + "kl": 0.541015625, + "learning_rate": 1.1905564924114671e-07, + "loss": 0.0005, + "reward": 3.1726194620132446, + "reward_std": 0.0722682923078537, + "rewards/final_reward": 1.6683353795083833, + "rewards/mask_iou_reward": 0.8341676897541916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1726195812225342, + "rewards/thk_ans_format_reward": 1.0, + "step": 2612, + "think_completion_length": 48.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.609375, + "epoch": 4.413153456998313, + "grad_norm": 9.814321839776376, + "kl": 0.59765625, + "learning_rate": 1.1871838111298482e-07, + "loss": 0.0006, + "reward": 3.2006527185440063, + "reward_std": 0.4120529443025589, + "rewards/final_reward": 1.0384373302250594, + "rewards/mask_iou_reward": 0.5192186651125297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2006526589393616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2613, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.453125, + "epoch": 4.414839797639123, + "grad_norm": 9.044116285812068, + "kl": 0.513671875, + "learning_rate": 1.1838111298482292e-07, + "loss": 0.0005, + "reward": 3.4873398542404175, + "reward_std": 0.09251206181943417, + "rewards/final_reward": 1.3417746076498847, + "rewards/mask_iou_reward": 0.6708873038249423, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4873397946357727, + "rewards/thk_ans_format_reward": 1.0, + "step": 2614, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.6875, + "epoch": 4.416526138279933, + "grad_norm": 11.461383350366239, + "kl": 0.541015625, + "learning_rate": 1.1804384485666104e-07, + "loss": 0.0005, + "reward": 3.5904886722564697, + "reward_std": 0.07166932441759855, + "rewards/final_reward": 1.625364095836272, + "rewards/mask_iou_reward": 0.812682047918136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5904887318611145, + "rewards/thk_ans_format_reward": 1.0, + "step": 2615, + "think_completion_length": 49.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.125, + "epoch": 4.418212478920742, + "grad_norm": 103.55393362384335, + "kl": 0.625, + "learning_rate": 1.1770657672849915e-07, + "loss": 0.0006, + "reward": 3.6286131143569946, + "reward_std": 0.06564396899193525, + "rewards/final_reward": 1.8305318926860772, + "rewards/mask_iou_reward": 0.9152659463430386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6286131739616394, + "rewards/thk_ans_format_reward": 1.0, + "step": 2616, + "think_completion_length": 39.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.59375, + "epoch": 4.419898819561552, + "grad_norm": 13.1873667863401, + "kl": 0.6015625, + "learning_rate": 1.1736930860033726e-07, + "loss": 0.0006, + "reward": 3.421006202697754, + "reward_std": 0.32600878179073334, + "rewards/final_reward": 1.6724000948973066, + "rewards/mask_iou_reward": 0.8362000474486533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4210062623023987, + "rewards/thk_ans_format_reward": 1.0, + "step": 2617, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.265625, + "epoch": 4.421585160202361, + "grad_norm": 9.256869268758377, + "kl": 0.59765625, + "learning_rate": 1.1703204047217538e-07, + "loss": 0.0006, + "reward": 3.6909961700439453, + "reward_std": 0.016168599016964436, + "rewards/final_reward": 1.5668830821447353, + "rewards/mask_iou_reward": 0.7834415410723676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6909962892532349, + "rewards/thk_ans_format_reward": 1.0, + "step": 2618, + "think_completion_length": 46.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.703125, + "epoch": 4.42327150084317, + "grad_norm": 10.38758359315407, + "kl": 0.673828125, + "learning_rate": 1.1669477234401348e-07, + "loss": 0.0007, + "reward": 3.471518635749817, + "reward_std": 0.03896564897149801, + "rewards/final_reward": 1.2505938336831524, + "rewards/mask_iou_reward": 0.6252969168415762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.471518635749817, + "rewards/thk_ans_format_reward": 1.0, + "step": 2619, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.015625, + "epoch": 4.424957841483979, + "grad_norm": 13.067336110176923, + "kl": 0.615234375, + "learning_rate": 1.1635750421585159e-07, + "loss": 0.0006, + "reward": 3.089142322540283, + "reward_std": 0.26153238862752914, + "rewards/final_reward": 1.465048299164459, + "rewards/mask_iou_reward": 0.7325241495822294, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0891424119472504, + "rewards/thk_ans_format_reward": 1.0, + "step": 2620, + "think_completion_length": 44.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.78125, + "epoch": 4.4266441821247895, + "grad_norm": 8.069392480164767, + "kl": 0.5390625, + "learning_rate": 1.160202360876897e-07, + "loss": 0.0005, + "reward": 2.5934301614761353, + "reward_std": 0.11824558675289154, + "rewards/final_reward": 0.417071096471484, + "rewards/mask_iou_reward": 0.208535548235742, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5934301316738129, + "rewards/thk_ans_format_reward": 1.0, + "step": 2621, + "think_completion_length": 46.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.671875, + "epoch": 4.428330522765599, + "grad_norm": 8.293984205803936, + "kl": 0.548828125, + "learning_rate": 1.1568296795952782e-07, + "loss": 0.0005, + "reward": 2.568539619445801, + "reward_std": 0.1302037239074707, + "rewards/final_reward": 0.7678146034402551, + "rewards/mask_iou_reward": 0.38390730172012755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.568539634346962, + "rewards/thk_ans_format_reward": 1.0, + "step": 2622, + "think_completion_length": 45.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.046875, + "epoch": 4.430016863406408, + "grad_norm": 17.131262575480264, + "kl": 0.5703125, + "learning_rate": 1.1534569983136592e-07, + "loss": 0.0006, + "reward": 3.780028223991394, + "reward_std": 0.03920180618297309, + "rewards/final_reward": 1.7282832905265988, + "rewards/mask_iou_reward": 0.8641416452632994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.780028223991394, + "rewards/thk_ans_format_reward": 1.0, + "step": 2623, + "think_completion_length": 43.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.25, + "epoch": 4.431703204047217, + "grad_norm": 6.430441929245077, + "kl": 0.4765625, + "learning_rate": 1.1500843170320403e-07, + "loss": 0.0005, + "reward": 2.705838918685913, + "reward_std": 0.11969677184242755, + "rewards/final_reward": 0.5233868673069243, + "rewards/mask_iou_reward": 0.26169343365346215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7058388888835907, + "rewards/thk_ans_format_reward": 1.0, + "step": 2624, + "think_completion_length": 45.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.984375, + "epoch": 4.433389544688027, + "grad_norm": 31.678806652203008, + "kl": 0.650390625, + "learning_rate": 1.1467116357504215e-07, + "loss": 0.0007, + "reward": 3.651994466781616, + "reward_std": 0.028817713260650635, + "rewards/final_reward": 1.6218849618589297, + "rewards/mask_iou_reward": 0.8109424809294649, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.651994526386261, + "rewards/thk_ans_format_reward": 1.0, + "step": 2625, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 4.435075885328836, + "grad_norm": 6.252600044446703, + "kl": 0.515625, + "learning_rate": 1.1433389544688026e-07, + "loss": 0.0005, + "reward": 3.4030381441116333, + "reward_std": 0.11711015552282333, + "rewards/final_reward": 1.3276957990905338, + "rewards/mask_iou_reward": 0.6638478995452669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.403038203716278, + "rewards/thk_ans_format_reward": 1.0, + "step": 2626, + "think_completion_length": 45.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.625, + "epoch": 4.4367622259696455, + "grad_norm": 8.745195964000725, + "kl": 0.63671875, + "learning_rate": 1.1399662731871836e-07, + "loss": 0.0006, + "reward": 3.6559841632843018, + "reward_std": 0.025955231860280037, + "rewards/final_reward": 1.5211647887452338, + "rewards/mask_iou_reward": 0.7605823943726169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6559841632843018, + "rewards/thk_ans_format_reward": 1.0, + "step": 2627, + "think_completion_length": 42.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.5, + "epoch": 4.438448566610456, + "grad_norm": 9.671689509302032, + "kl": 0.4755859375, + "learning_rate": 1.1365935919055649e-07, + "loss": 0.0005, + "reward": 3.4869346618652344, + "reward_std": 0.11515981703996658, + "rewards/final_reward": 1.492222922357928, + "rewards/mask_iou_reward": 0.746111461178964, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4869346618652344, + "rewards/thk_ans_format_reward": 1.0, + "step": 2628, + "think_completion_length": 42.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.34375, + "epoch": 4.440134907251265, + "grad_norm": 23.11696267408195, + "kl": 0.669921875, + "learning_rate": 1.133220910623946e-07, + "loss": 0.0007, + "reward": 3.32826566696167, + "reward_std": 0.21583092957735062, + "rewards/final_reward": 1.651356880883315, + "rewards/mask_iou_reward": 0.8256784404416575, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.32826566696167, + "rewards/thk_ans_format_reward": 1.0, + "step": 2629, + "think_completion_length": 41.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.96875, + "epoch": 4.441821247892074, + "grad_norm": 13.057917991823976, + "kl": 0.609375, + "learning_rate": 1.1298482293423272e-07, + "loss": 0.0006, + "reward": 3.688571810722351, + "reward_std": 0.006875853752717376, + "rewards/final_reward": 1.6953000644129448, + "rewards/mask_iou_reward": 0.8476500322064724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6885717511177063, + "rewards/thk_ans_format_reward": 1.0, + "step": 2630, + "think_completion_length": 40.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.203125, + "epoch": 4.443507588532883, + "grad_norm": 8.93951053796139, + "kl": 0.60546875, + "learning_rate": 1.1264755480607083e-07, + "loss": 0.0006, + "reward": 3.3038218021392822, + "reward_std": 0.06475062295794487, + "rewards/final_reward": 0.7706906485427694, + "rewards/mask_iou_reward": 0.3853453242713847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303821861743927, + "rewards/thk_ans_format_reward": 1.0, + "step": 2631, + "think_completion_length": 47.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.4375, + "epoch": 4.445193929173693, + "grad_norm": 8.98465106731901, + "kl": 0.5634765625, + "learning_rate": 1.1231028667790893e-07, + "loss": 0.0006, + "reward": 3.3895570039749146, + "reward_std": 0.07609788700938225, + "rewards/final_reward": 1.0268260569306906, + "rewards/mask_iou_reward": 0.5134130284653453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3895567655563354, + "rewards/thk_ans_format_reward": 1.0, + "step": 2632, + "think_completion_length": 48.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.375, + "epoch": 4.4468802698145025, + "grad_norm": 8.868591815915336, + "kl": 0.603515625, + "learning_rate": 1.1197301854974705e-07, + "loss": 0.0006, + "reward": 3.3486082553863525, + "reward_std": 0.04780184803530574, + "rewards/final_reward": 1.0677600247327752, + "rewards/mask_iou_reward": 0.5338800123663876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3486083149909973, + "rewards/thk_ans_format_reward": 1.0, + "step": 2633, + "think_completion_length": 41.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.65625, + "epoch": 4.448566610455312, + "grad_norm": 7.070680793502134, + "kl": 0.5625, + "learning_rate": 1.1163575042158516e-07, + "loss": 0.0006, + "reward": 3.4986801147460938, + "reward_std": 0.19939835742115974, + "rewards/final_reward": 1.9185519446167523, + "rewards/mask_iou_reward": 0.9592759723083761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4986801147460938, + "rewards/thk_ans_format_reward": 1.0, + "step": 2634, + "think_completion_length": 46.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.4375, + "epoch": 4.450252951096122, + "grad_norm": 12.153644101177255, + "kl": 0.58984375, + "learning_rate": 1.1129848229342327e-07, + "loss": 0.0006, + "reward": 3.038814663887024, + "reward_std": 0.20692519284784794, + "rewards/final_reward": 1.2421937835092174, + "rewards/mask_iou_reward": 0.6210968917546087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.038814753293991, + "rewards/thk_ans_format_reward": 1.0, + "step": 2635, + "think_completion_length": 42.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.125, + "epoch": 4.451939291736931, + "grad_norm": 10.058118041299805, + "kl": 0.603515625, + "learning_rate": 1.1096121416526137e-07, + "loss": 0.0006, + "reward": 3.3655534982681274, + "reward_std": 0.04708591848611832, + "rewards/final_reward": 1.5861924009273762, + "rewards/mask_iou_reward": 0.7930962004636881, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3655535578727722, + "rewards/thk_ans_format_reward": 1.0, + "step": 2636, + "think_completion_length": 41.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.71875, + "epoch": 4.45362563237774, + "grad_norm": 11.354474998702795, + "kl": 0.533203125, + "learning_rate": 1.1062394603709949e-07, + "loss": 0.0005, + "reward": 3.6468669176101685, + "reward_std": 0.06775764841586351, + "rewards/final_reward": 1.629458788075444, + "rewards/mask_iou_reward": 0.814729394037722, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6468668580055237, + "rewards/thk_ans_format_reward": 1.0, + "step": 2637, + "think_completion_length": 44.1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.796875, + "epoch": 4.455311973018549, + "grad_norm": 6.542571543632283, + "kl": 0.59765625, + "learning_rate": 1.102866779089376e-07, + "loss": 0.0006, + "reward": 2.995203733444214, + "reward_std": 0.05667147785425186, + "rewards/final_reward": 0.824913530327498, + "rewards/mask_iou_reward": 0.412456765163749, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9952037483453751, + "rewards/thk_ans_format_reward": 1.0, + "step": 2638, + "think_completion_length": 45.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.609375, + "epoch": 4.456998313659359, + "grad_norm": 6.756942011236939, + "kl": 0.5390625, + "learning_rate": 1.0994940978077572e-07, + "loss": 0.0005, + "reward": 3.3548476696014404, + "reward_std": 0.07482127472758293, + "rewards/final_reward": 1.7545954018870025, + "rewards/mask_iou_reward": 0.8772977009435012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.354847490787506, + "rewards/thk_ans_format_reward": 1.0, + "step": 2639, + "think_completion_length": 43.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 4.458684654300169, + "grad_norm": 11.833040497849026, + "kl": 0.5185546875, + "learning_rate": 1.0961214165261383e-07, + "loss": 0.0005, + "reward": 3.5759642124176025, + "reward_std": 0.027602959889918566, + "rewards/final_reward": 1.5843839322703575, + "rewards/mask_iou_reward": 0.7921919661351787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5759642124176025, + "rewards/thk_ans_format_reward": 1.0, + "step": 2640, + "think_completion_length": 40.90625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.859375, + "epoch": 4.460370994940978, + "grad_norm": 16.623365225933398, + "kl": 0.5859375, + "learning_rate": 1.0927487352445193e-07, + "loss": 0.0005, + "reward": 3.6343547105789185, + "reward_std": 0.021240360219962895, + "rewards/final_reward": 1.642886709337712, + "rewards/mask_iou_reward": 0.821443354668856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6343547701835632, + "rewards/thk_ans_format_reward": 1.0, + "step": 2641, + "think_completion_length": 45.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.546875, + "epoch": 4.462057335581788, + "grad_norm": 5.592280894385483, + "kl": 0.4931640625, + "learning_rate": 1.0893760539629004e-07, + "loss": 0.0005, + "reward": 3.7818983793258667, + "reward_std": 0.11853981949388981, + "rewards/final_reward": 1.88099498342845, + "rewards/mask_iou_reward": 0.940497491714225, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7818983793258667, + "rewards/thk_ans_format_reward": 1.0, + "step": 2642, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.484375, + "epoch": 4.463743676222597, + "grad_norm": 8.582496126745752, + "kl": 0.61328125, + "learning_rate": 1.0860033726812816e-07, + "loss": 0.0006, + "reward": 3.078013300895691, + "reward_std": 0.1424650065600872, + "rewards/final_reward": 0.8963709606380095, + "rewards/mask_iou_reward": 0.44818548031900474, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0780134201049805, + "rewards/thk_ans_format_reward": 1.0, + "step": 2643, + "think_completion_length": 41.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.828125, + "epoch": 4.465430016863406, + "grad_norm": 46.17055861852506, + "kl": 0.552734375, + "learning_rate": 1.0826306913996627e-07, + "loss": 0.0005, + "reward": 3.219741702079773, + "reward_std": 0.26992932334542274, + "rewards/final_reward": 1.4363236266823287, + "rewards/mask_iou_reward": 0.7181618133411644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2197417914867401, + "rewards/thk_ans_format_reward": 1.0, + "step": 2644, + "think_completion_length": 46.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.859375, + "epoch": 4.467116357504215, + "grad_norm": 16.039393307939402, + "kl": 0.646484375, + "learning_rate": 1.0792580101180437e-07, + "loss": 0.0007, + "reward": 3.00642192363739, + "reward_std": 0.0911721233278513, + "rewards/final_reward": 1.3042461841878612, + "rewards/mask_iou_reward": 0.6521230920939306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0064219534397125, + "rewards/thk_ans_format_reward": 1.0, + "step": 2645, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.125, + "epoch": 4.4688026981450255, + "grad_norm": 14.585010775776698, + "kl": 0.5703125, + "learning_rate": 1.0758853288364249e-07, + "loss": 0.0005, + "reward": 3.214327096939087, + "reward_std": 0.11271106917411089, + "rewards/final_reward": 1.3228343652584247, + "rewards/mask_iou_reward": 0.6614171826292123, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2143271565437317, + "rewards/thk_ans_format_reward": 1.0, + "step": 2646, + "think_completion_length": 50.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.84375, + "epoch": 4.470489038785835, + "grad_norm": 11.422126760394363, + "kl": 0.609375, + "learning_rate": 1.072512647554806e-07, + "loss": 0.0006, + "reward": 3.3172987699508667, + "reward_std": 0.07890355307608843, + "rewards/final_reward": 1.827619537052433, + "rewards/mask_iou_reward": 0.9138097685262165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3172988891601562, + "rewards/thk_ans_format_reward": 1.0, + "step": 2647, + "think_completion_length": 44.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.765625, + "epoch": 4.472175379426644, + "grad_norm": 8.19118901068108, + "kl": 0.59375, + "learning_rate": 1.0691399662731871e-07, + "loss": 0.0006, + "reward": 3.2954260110855103, + "reward_std": 0.14317326247692108, + "rewards/final_reward": 1.236711009110651, + "rewards/mask_iou_reward": 0.6183555045553255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2954260110855103, + "rewards/thk_ans_format_reward": 1.0, + "step": 2648, + "think_completion_length": 44.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.59375, + "epoch": 4.473861720067454, + "grad_norm": 7.837971132263195, + "kl": 0.5625, + "learning_rate": 1.0657672849915683e-07, + "loss": 0.0006, + "reward": 3.2900350093841553, + "reward_std": 0.04385017417371273, + "rewards/final_reward": 1.6600767316864327, + "rewards/mask_iou_reward": 0.8300383658432163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.290034830570221, + "rewards/thk_ans_format_reward": 1.0, + "step": 2649, + "think_completion_length": 45.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.03125, + "epoch": 4.475548060708263, + "grad_norm": 6.960078569740148, + "kl": 0.619140625, + "learning_rate": 1.0623946037099493e-07, + "loss": 0.0006, + "reward": 3.0506186485290527, + "reward_std": 0.11288509517908096, + "rewards/final_reward": 0.9843538060173483, + "rewards/mask_iou_reward": 0.49217690300867417, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0506186485290527, + "rewards/thk_ans_format_reward": 1.0, + "step": 2650, + "think_completion_length": 45.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.6875, + "epoch": 4.477234401349072, + "grad_norm": 30.315258681626336, + "kl": 0.595703125, + "learning_rate": 1.0590219224283304e-07, + "loss": 0.0006, + "reward": 3.734488844871521, + "reward_std": 0.028505256865173578, + "rewards/final_reward": 1.8352535786063089, + "rewards/mask_iou_reward": 0.9176267893031544, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7344887852668762, + "rewards/thk_ans_format_reward": 1.0, + "step": 2651, + "think_completion_length": 41.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.015625, + "epoch": 4.4789207419898815, + "grad_norm": 20.307580488842333, + "kl": 0.619140625, + "learning_rate": 1.0556492411467116e-07, + "loss": 0.0006, + "reward": 3.286136507987976, + "reward_std": 0.12623700872063637, + "rewards/final_reward": 1.3470083843749305, + "rewards/mask_iou_reward": 0.6735041921874653, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.286136507987976, + "rewards/thk_ans_format_reward": 1.0, + "step": 2652, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5, + "epoch": 4.480607082630692, + "grad_norm": 9.208181074924948, + "kl": 0.740234375, + "learning_rate": 1.0522765598650927e-07, + "loss": 0.0008, + "reward": 3.2891019582748413, + "reward_std": 0.03242574352771044, + "rewards/final_reward": 0.9127468298879465, + "rewards/mask_iou_reward": 0.45637341494397327, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2891019880771637, + "rewards/thk_ans_format_reward": 1.0, + "step": 2653, + "think_completion_length": 48.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.59375, + "epoch": 4.482293423271501, + "grad_norm": 10.054836029278828, + "kl": 0.5048828125, + "learning_rate": 1.0489038785834737e-07, + "loss": 0.0005, + "reward": 3.685955047607422, + "reward_std": 0.015149123733863235, + "rewards/final_reward": 1.9014045515848474, + "rewards/mask_iou_reward": 0.9507022757924237, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6859549283981323, + "rewards/thk_ans_format_reward": 1.0, + "step": 2654, + "think_completion_length": 41.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.140625, + "epoch": 4.48397976391231, + "grad_norm": 10.076922778528303, + "kl": 0.513671875, + "learning_rate": 1.0455311973018548e-07, + "loss": 0.0004, + "reward": 3.829998254776001, + "reward_std": 0.043199281208217144, + "rewards/final_reward": 1.7935038134722294, + "rewards/mask_iou_reward": 0.8967519067361147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.829998254776001, + "rewards/thk_ans_format_reward": 1.0, + "step": 2655, + "think_completion_length": 49.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.71875, + "epoch": 4.48566610455312, + "grad_norm": 14.24780204502807, + "kl": 0.619140625, + "learning_rate": 1.042158516020236e-07, + "loss": 0.0006, + "reward": 3.779955506324768, + "reward_std": 0.0681462474167347, + "rewards/final_reward": 1.9018116411955917, + "rewards/mask_iou_reward": 0.9509058205977958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7799555659294128, + "rewards/thk_ans_format_reward": 1.0, + "step": 2656, + "think_completion_length": 44.09375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.578125, + "epoch": 4.487352445193929, + "grad_norm": 9.939901597778025, + "kl": 0.7890625, + "learning_rate": 1.0387858347386173e-07, + "loss": 0.0008, + "reward": 3.632006525993347, + "reward_std": 0.016680723056197166, + "rewards/final_reward": 1.846947845788642, + "rewards/mask_iou_reward": 0.923473922894321, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.632006585597992, + "rewards/thk_ans_format_reward": 1.0, + "step": 2657, + "think_completion_length": 42.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.171875, + "epoch": 4.4890387858347385, + "grad_norm": 5.408654799689108, + "kl": 0.591796875, + "learning_rate": 1.0354131534569983e-07, + "loss": 0.0006, + "reward": 3.5690382719039917, + "reward_std": 0.029328839387744665, + "rewards/final_reward": 1.3267483429997529, + "rewards/mask_iou_reward": 0.6633741714998764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5690380930900574, + "rewards/thk_ans_format_reward": 1.0, + "step": 2658, + "think_completion_length": 44.28125 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.0, + "epoch": 4.490725126475548, + "grad_norm": 9.187404635678938, + "kl": 0.642578125, + "learning_rate": 1.0320404721753794e-07, + "loss": 0.0007, + "reward": 3.5022149085998535, + "reward_std": 0.025782881304621696, + "rewards/final_reward": 1.1732110903545854, + "rewards/mask_iou_reward": 0.5866055451772927, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5022149085998535, + "rewards/thk_ans_format_reward": 1.0, + "step": 2659, + "think_completion_length": 42.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.8125, + "epoch": 4.492411467116358, + "grad_norm": 5.49395932994343, + "kl": 0.515625, + "learning_rate": 1.0286677908937605e-07, + "loss": 0.0005, + "reward": 3.5852386951446533, + "reward_std": 0.09690108336508274, + "rewards/final_reward": 1.3697257018342164, + "rewards/mask_iou_reward": 0.6848628509171082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5852386355400085, + "rewards/thk_ans_format_reward": 1.0, + "step": 2660, + "think_completion_length": 43.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.40625, + "epoch": 4.494097807757167, + "grad_norm": 5.243002984426864, + "kl": 0.568359375, + "learning_rate": 1.0252951096121417e-07, + "loss": 0.0006, + "reward": 3.3164455890655518, + "reward_std": 0.07848885655403137, + "rewards/final_reward": 0.8967024527831223, + "rewards/mask_iou_reward": 0.44835122639156116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.316445529460907, + "rewards/thk_ans_format_reward": 1.0, + "step": 2661, + "think_completion_length": 39.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.296875, + "epoch": 4.495784148397976, + "grad_norm": 3.473115692534412, + "kl": 0.51171875, + "learning_rate": 1.0219224283305228e-07, + "loss": 0.0005, + "reward": 3.041918635368347, + "reward_std": 0.19841172359883785, + "rewards/final_reward": 1.1676012257269686, + "rewards/mask_iou_reward": 0.5838006128634843, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.073168694972992, + "rewards/thk_ans_format_reward": 1.0, + "step": 2662, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.03125, + "epoch": 4.497470489038786, + "grad_norm": 13.879925985773543, + "kl": 0.5068359375, + "learning_rate": 1.0185497470489038e-07, + "loss": 0.0005, + "reward": 3.432920813560486, + "reward_std": 0.31590735912323, + "rewards/final_reward": 1.1663536039431972, + "rewards/mask_iou_reward": 0.5831768019715986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432920753955841, + "rewards/thk_ans_format_reward": 1.0, + "step": 2663, + "think_completion_length": 45.0625 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.15625, + "epoch": 4.499156829679595, + "grad_norm": 7.630152597686654, + "kl": 0.5390625, + "learning_rate": 1.015177065767285e-07, + "loss": 0.0005, + "reward": 3.535985827445984, + "reward_std": 0.21657665446400642, + "rewards/final_reward": 1.5852197319364074, + "rewards/mask_iou_reward": 0.7926098659682037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5359859466552734, + "rewards/thk_ans_format_reward": 1.0, + "step": 2664, + "think_completion_length": 42.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 4.500843170320405, + "grad_norm": 24.542373673707836, + "kl": 0.61328125, + "learning_rate": 1.0118043844856661e-07, + "loss": 0.0006, + "reward": 3.67924702167511, + "reward_std": 0.06537250243127346, + "rewards/final_reward": 1.5337015185011438, + "rewards/mask_iou_reward": 0.7668507592505719, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6792468428611755, + "rewards/thk_ans_format_reward": 1.0, + "step": 2665, + "think_completion_length": 42.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.5, + "epoch": 4.502529510961214, + "grad_norm": 8.88138998484568, + "kl": 0.62109375, + "learning_rate": 1.0084317032040472e-07, + "loss": 0.0007, + "reward": 3.5684311389923096, + "reward_std": 0.12402192875742912, + "rewards/final_reward": 1.1883733673027432, + "rewards/mask_iou_reward": 0.5941866836513716, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5684311389923096, + "rewards/thk_ans_format_reward": 1.0, + "step": 2666, + "think_completion_length": 43.5625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.28125, + "epoch": 4.504215851602024, + "grad_norm": 5.665710067122564, + "kl": 0.537109375, + "learning_rate": 1.0050590219224282e-07, + "loss": 0.0005, + "reward": 3.340713143348694, + "reward_std": 0.08154628158081323, + "rewards/final_reward": 1.9216926105581993, + "rewards/mask_iou_reward": 0.9608463052790996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3407131731510162, + "rewards/thk_ans_format_reward": 1.0, + "step": 2667, + "think_completion_length": 40.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.546875, + "epoch": 4.505902192242833, + "grad_norm": 13.118503767313488, + "kl": 0.544921875, + "learning_rate": 1.0016863406408094e-07, + "loss": 0.0005, + "reward": 3.4577395915985107, + "reward_std": 0.10068205185234547, + "rewards/final_reward": 1.1926356698317773, + "rewards/mask_iou_reward": 0.5963178349158886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4577394127845764, + "rewards/thk_ans_format_reward": 1.0, + "step": 2668, + "think_completion_length": 38.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.421875, + "epoch": 4.507588532883642, + "grad_norm": 10.438658465372848, + "kl": 0.5458984375, + "learning_rate": 9.983136593591905e-08, + "loss": 0.0005, + "reward": 3.6971330642700195, + "reward_std": 0.05630340613424778, + "rewards/final_reward": 1.75558580296901, + "rewards/mask_iou_reward": 0.877792901484505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.69713294506073, + "rewards/thk_ans_format_reward": 1.0, + "step": 2669, + "think_completion_length": 39.84375 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.5, + "epoch": 4.509274873524452, + "grad_norm": 7.884218664723201, + "kl": 0.6171875, + "learning_rate": 9.949409780775717e-08, + "loss": 0.0006, + "reward": 3.3458492755889893, + "reward_std": 0.31212668120861053, + "rewards/final_reward": 0.873962773999293, + "rewards/mask_iou_reward": 0.4369813869996465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3458492755889893, + "rewards/thk_ans_format_reward": 1.0, + "step": 2670, + "think_completion_length": 42.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.28125, + "epoch": 4.5109612141652615, + "grad_norm": 9.23283295335352, + "kl": 0.611328125, + "learning_rate": 9.915682967959528e-08, + "loss": 0.0006, + "reward": 3.2621822357177734, + "reward_std": 0.4180988222360611, + "rewards/final_reward": 1.569866105542606, + "rewards/mask_iou_reward": 0.784933052771303, + "rewards/sam_format_reward": 0.984375, + "rewards/sam_reward_func_ultra": 1.293432354927063, + "rewards/thk_ans_format_reward": 0.984375, + "step": 2671, + "think_completion_length": 37.65625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.875, + "epoch": 4.512647554806071, + "grad_norm": 19.30137884206897, + "kl": 0.599609375, + "learning_rate": 9.881956155143338e-08, + "loss": 0.0006, + "reward": 3.455308437347412, + "reward_std": 0.13220026344060898, + "rewards/final_reward": 1.458789095807849, + "rewards/mask_iou_reward": 0.7293945479039246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4553083777427673, + "rewards/thk_ans_format_reward": 1.0, + "step": 2672, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.078125, + "epoch": 4.51433389544688, + "grad_norm": 8.661303822832371, + "kl": 0.61328125, + "learning_rate": 9.84822934232715e-08, + "loss": 0.0006, + "reward": 2.5912883281707764, + "reward_std": 0.16992703033611178, + "rewards/final_reward": 0.321427037456387, + "rewards/mask_iou_reward": 0.1607135187281935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5912883952260017, + "rewards/thk_ans_format_reward": 1.0, + "step": 2673, + "think_completion_length": 38.03125 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.390625, + "epoch": 4.51602023608769, + "grad_norm": 9.472047529229577, + "kl": 0.623046875, + "learning_rate": 9.814502529510961e-08, + "loss": 0.0007, + "reward": 3.272798538208008, + "reward_std": 0.05844417680054903, + "rewards/final_reward": 1.0032500490354634, + "rewards/mask_iou_reward": 0.5016250245177317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2727984189987183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2674, + "think_completion_length": 47.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.4375, + "epoch": 4.517706576728499, + "grad_norm": 7.799498451545803, + "kl": 0.55078125, + "learning_rate": 9.780775716694772e-08, + "loss": 0.0006, + "reward": 3.3780055046081543, + "reward_std": 0.24958141800016165, + "rewards/final_reward": 1.1998570376285973, + "rewards/mask_iou_reward": 0.5999285188142987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3780055046081543, + "rewards/thk_ans_format_reward": 1.0, + "step": 2675, + "think_completion_length": 47.6875 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.890625, + "epoch": 4.519392917369308, + "grad_norm": 16.83449299493527, + "kl": 0.59375, + "learning_rate": 9.747048903878582e-08, + "loss": 0.0006, + "reward": 3.4879668951034546, + "reward_std": 0.17732420563697815, + "rewards/final_reward": 1.0509339859279632, + "rewards/mask_iou_reward": 0.5254669929639816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.487967073917389, + "rewards/thk_ans_format_reward": 1.0, + "step": 2676, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.609375, + "epoch": 4.5210792580101185, + "grad_norm": 10.81027860870198, + "kl": 0.59375, + "learning_rate": 9.713322091062394e-08, + "loss": 0.0006, + "reward": 3.491172671318054, + "reward_std": 0.08230869006365538, + "rewards/final_reward": 1.9163851028379348, + "rewards/mask_iou_reward": 0.9581925514189674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4911725521087646, + "rewards/thk_ans_format_reward": 1.0, + "step": 2677, + "think_completion_length": 43.96875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.140625, + "epoch": 4.522765598650928, + "grad_norm": 16.38177906884117, + "kl": 0.626953125, + "learning_rate": 9.679595278246205e-08, + "loss": 0.0006, + "reward": 3.313094973564148, + "reward_std": 0.043430982856079936, + "rewards/final_reward": 1.412640101241867, + "rewards/mask_iou_reward": 0.7063200506209335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3130950927734375, + "rewards/thk_ans_format_reward": 1.0, + "step": 2678, + "think_completion_length": 41.78125 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.71875, + "epoch": 4.524451939291737, + "grad_norm": 55.652422746933475, + "kl": 0.5439453125, + "learning_rate": 9.645868465430016e-08, + "loss": 0.0005, + "reward": 3.000455379486084, + "reward_std": 0.12246969155967236, + "rewards/final_reward": 0.7351268397904934, + "rewards/mask_iou_reward": 0.3675634198952467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0004554092884064, + "rewards/thk_ans_format_reward": 1.0, + "step": 2679, + "think_completion_length": 42.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.46875, + "epoch": 4.526138279932546, + "grad_norm": 16.0943834709773, + "kl": 0.576171875, + "learning_rate": 9.612141652613827e-08, + "loss": 0.0006, + "reward": 3.597416639328003, + "reward_std": 0.1382383331656456, + "rewards/final_reward": 1.431301174343287, + "rewards/mask_iou_reward": 0.7156505871716435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5974166989326477, + "rewards/thk_ans_format_reward": 1.0, + "step": 2680, + "think_completion_length": 44.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.796875, + "epoch": 4.527824620573356, + "grad_norm": 46.47438098712813, + "kl": 0.58203125, + "learning_rate": 9.578414839797638e-08, + "loss": 0.0006, + "reward": 3.5468854904174805, + "reward_std": 0.06338476575911045, + "rewards/final_reward": 1.5643748779855864, + "rewards/mask_iou_reward": 0.7821874389927932, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5468855500221252, + "rewards/thk_ans_format_reward": 1.0, + "step": 2681, + "think_completion_length": 47.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.234375, + "epoch": 4.529510961214165, + "grad_norm": 65.56906342241375, + "kl": 0.501953125, + "learning_rate": 9.544688026981449e-08, + "loss": 0.0005, + "reward": 3.229214310646057, + "reward_std": 0.045433159917593, + "rewards/final_reward": 0.9161283390685623, + "rewards/mask_iou_reward": 0.45806416953428114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.229214370250702, + "rewards/thk_ans_format_reward": 1.0, + "step": 2682, + "think_completion_length": 44.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 4.5311973018549745, + "grad_norm": 6.621407437686109, + "kl": 0.6328125, + "learning_rate": 9.510961214165261e-08, + "loss": 0.0006, + "reward": 3.581550717353821, + "reward_std": 0.004469448467716575, + "rewards/final_reward": 1.2020244633899486, + "rewards/mask_iou_reward": 0.6010122316949743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5815508365631104, + "rewards/thk_ans_format_reward": 1.0, + "step": 2683, + "think_completion_length": 42.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.96875, + "epoch": 4.532883642495785, + "grad_norm": 5.598185861981018, + "kl": 0.525390625, + "learning_rate": 9.477234401349072e-08, + "loss": 0.0005, + "reward": 3.36284863948822, + "reward_std": 0.1877284124493599, + "rewards/final_reward": 1.7833434425748234, + "rewards/mask_iou_reward": 0.8916717212874117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3628486394882202, + "rewards/thk_ans_format_reward": 1.0, + "step": 2684, + "think_completion_length": 39.9375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.515625, + "epoch": 4.534569983136594, + "grad_norm": 13.05266193699777, + "kl": 0.5517578125, + "learning_rate": 9.443507588532882e-08, + "loss": 0.0006, + "reward": 3.418926954269409, + "reward_std": 0.12092643231153488, + "rewards/final_reward": 1.430031399150564, + "rewards/mask_iou_reward": 0.715015699575282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.418927013874054, + "rewards/thk_ans_format_reward": 1.0, + "step": 2685, + "think_completion_length": 43.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.09375, + "epoch": 4.536256323777403, + "grad_norm": 9.639271862123636, + "kl": 0.609375, + "learning_rate": 9.409780775716694e-08, + "loss": 0.0006, + "reward": 3.740368962287903, + "reward_std": 0.021685122046619654, + "rewards/final_reward": 1.768809005736757, + "rewards/mask_iou_reward": 0.8844045028683785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7403690218925476, + "rewards/thk_ans_format_reward": 1.0, + "step": 2686, + "think_completion_length": 45.59375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.875, + "epoch": 4.537942664418212, + "grad_norm": 8.237652072741113, + "kl": 0.580078125, + "learning_rate": 9.376053962900506e-08, + "loss": 0.0006, + "reward": 3.6791892051696777, + "reward_std": 0.009476853301748633, + "rewards/final_reward": 1.927470946512023, + "rewards/mask_iou_reward": 0.9637354732560115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.679189383983612, + "rewards/thk_ans_format_reward": 1.0, + "step": 2687, + "think_completion_length": 42.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 4.539629005059022, + "grad_norm": 8.646826214633027, + "kl": 0.55078125, + "learning_rate": 9.342327150084318e-08, + "loss": 0.0006, + "reward": 3.4965227842330933, + "reward_std": 0.17469704151153564, + "rewards/final_reward": 1.4711453506980399, + "rewards/mask_iou_reward": 0.7355726753490199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4965226650238037, + "rewards/thk_ans_format_reward": 1.0, + "step": 2688, + "think_completion_length": 43.4375 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.5625, + "epoch": 4.541315345699831, + "grad_norm": 10.657778008562978, + "kl": 0.6015625, + "learning_rate": 9.308600337268128e-08, + "loss": 0.0006, + "reward": 3.8605817556381226, + "reward_std": 0.020185125060379505, + "rewards/final_reward": 1.823982382216586, + "rewards/mask_iou_reward": 0.911991191108293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8605817556381226, + "rewards/thk_ans_format_reward": 1.0, + "step": 2689, + "think_completion_length": 42.15625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.859375, + "epoch": 4.543001686340641, + "grad_norm": 9.243340633120326, + "kl": 0.720703125, + "learning_rate": 9.274873524451939e-08, + "loss": 0.0007, + "reward": 3.2710307836532593, + "reward_std": 0.04928067233413458, + "rewards/final_reward": 1.2541776348164744, + "rewards/mask_iou_reward": 0.6270888174082372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2710306346416473, + "rewards/thk_ans_format_reward": 1.0, + "step": 2690, + "think_completion_length": 45.40625 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.28125, + "epoch": 4.544688026981451, + "grad_norm": 7.80711729689743, + "kl": 0.533203125, + "learning_rate": 9.24114671163575e-08, + "loss": 0.0005, + "reward": 3.895188093185425, + "reward_std": 0.013354545866604894, + "rewards/final_reward": 1.987251964311651, + "rewards/mask_iou_reward": 0.9936259821558255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8951881527900696, + "rewards/thk_ans_format_reward": 1.0, + "step": 2691, + "think_completion_length": 43.34375 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.578125, + "epoch": 4.54637436762226, + "grad_norm": 46.521774713314045, + "kl": 0.59765625, + "learning_rate": 9.207419898819562e-08, + "loss": 0.0006, + "reward": 3.3578141927719116, + "reward_std": 0.13790087588131428, + "rewards/final_reward": 1.3110319838483524, + "rewards/mask_iou_reward": 0.6555159919241762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3578141927719116, + "rewards/thk_ans_format_reward": 1.0, + "step": 2692, + "think_completion_length": 42.3125 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.65625, + "epoch": 4.548060708263069, + "grad_norm": 12.210287722551254, + "kl": 0.642578125, + "learning_rate": 9.173693086003373e-08, + "loss": 0.0006, + "reward": 3.5226752758026123, + "reward_std": 0.03340075630694628, + "rewards/final_reward": 1.2770168157985262, + "rewards/mask_iou_reward": 0.6385084078992631, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5226754546165466, + "rewards/thk_ans_format_reward": 1.0, + "step": 2693, + "think_completion_length": 43.21875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.796875, + "epoch": 4.549747048903878, + "grad_norm": 5.288177156407879, + "kl": 0.623046875, + "learning_rate": 9.139966273187183e-08, + "loss": 0.0006, + "reward": 3.1990665197372437, + "reward_std": 0.0531660639680922, + "rewards/final_reward": 1.2057744653670666, + "rewards/mask_iou_reward": 0.6028872326835333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.199066400527954, + "rewards/thk_ans_format_reward": 1.0, + "step": 2694, + "think_completion_length": 40.8125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.734375, + "epoch": 4.551433389544688, + "grad_norm": 17.94271726562434, + "kl": 0.578125, + "learning_rate": 9.106239460370995e-08, + "loss": 0.0006, + "reward": 3.2866199016571045, + "reward_std": 0.06423737155273557, + "rewards/final_reward": 1.3763903356116252, + "rewards/mask_iou_reward": 0.6881951678058126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2866200506687164, + "rewards/thk_ans_format_reward": 1.0, + "step": 2695, + "think_completion_length": 40.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 4.5531197301854975, + "grad_norm": 24.753576545821314, + "kl": 0.5546875, + "learning_rate": 9.072512647554806e-08, + "loss": 0.0006, + "reward": 3.3977984189987183, + "reward_std": 0.14045307040214539, + "rewards/final_reward": 1.4333959971150594, + "rewards/mask_iou_reward": 0.7166979985575297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3977983593940735, + "rewards/thk_ans_format_reward": 1.0, + "step": 2696, + "think_completion_length": 44.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.78125, + "epoch": 4.554806070826307, + "grad_norm": 6.709423049230818, + "kl": 0.521484375, + "learning_rate": 9.038785834738617e-08, + "loss": 0.0005, + "reward": 2.9518043994903564, + "reward_std": 0.07662581279873848, + "rewards/final_reward": 0.9217370347905265, + "rewards/mask_iou_reward": 0.46086851739526324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9518044590950012, + "rewards/thk_ans_format_reward": 1.0, + "step": 2697, + "think_completion_length": 43.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.078125, + "epoch": 4.556492411467117, + "grad_norm": 36.19834516157221, + "kl": 0.60546875, + "learning_rate": 9.005059021922427e-08, + "loss": 0.0006, + "reward": 3.086367964744568, + "reward_std": 0.08483387529850006, + "rewards/final_reward": 1.4563193932476, + "rewards/mask_iou_reward": 0.7281596966238, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0863679647445679, + "rewards/thk_ans_format_reward": 1.0, + "step": 2698, + "think_completion_length": 46.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.640625, + "epoch": 4.558178752107926, + "grad_norm": 5.445666177713425, + "kl": 0.560546875, + "learning_rate": 8.971332209106239e-08, + "loss": 0.0006, + "reward": 3.633660316467285, + "reward_std": 0.08000330440700054, + "rewards/final_reward": 1.769485983974965, + "rewards/mask_iou_reward": 0.8847429919874825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6336602568626404, + "rewards/thk_ans_format_reward": 1.0, + "step": 2699, + "think_completion_length": 39.46875 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.875, + "epoch": 4.559865092748735, + "grad_norm": 10.187202355665686, + "kl": 0.626953125, + "learning_rate": 8.93760539629005e-08, + "loss": 0.0005, + "reward": 3.619858741760254, + "reward_std": 0.015684593934565783, + "rewards/final_reward": 1.3947394514966747, + "rewards/mask_iou_reward": 0.6973697257483373, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6198588013648987, + "rewards/thk_ans_format_reward": 1.0, + "step": 2700, + "think_completion_length": 41.34375 + } + ], + "logging_steps": 1.0, + "max_steps": 2965, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}