{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.559865092748735, "eval_steps": 100, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 212.203125, "epoch": 0.0016863406408094434, "grad_norm": 4.541860923771718, "kl": 0.0, "learning_rate": 9.99662731871838e-07, "loss": 0.0, "reward": 1.3358332514762878, "reward_std": 0.7925846576690674, "rewards/final_reward": 0.08555527292116892, "rewards/mask_iou_reward": 0.04277763646058446, "rewards/sam_format_reward": 0.75, "rewards/sam_reward_func_ultra": 0.19520824775099754, "rewards/thk_ans_format_reward": 0.390625, "step": 1, "think_completion_length": 54.375 }, { "clip_ratio": 0.0, "completion_length": 193.484375, "epoch": 0.003372681281618887, "grad_norm": 10.409662000899418, "kl": 0.000614166259765625, "learning_rate": 9.993254637436761e-07, "loss": 0.0, "reward": 1.3770169019699097, "reward_std": 0.7793349027633667, "rewards/final_reward": 0.17952230510610304, "rewards/mask_iou_reward": 0.08976115255305152, "rewards/sam_format_reward": 0.828125, "rewards/sam_reward_func_ultra": 0.17389196157455444, "rewards/thk_ans_format_reward": 0.375, "step": 2, "think_completion_length": 61.1875 }, { "clip_ratio": 0.0, "completion_length": 243.328125, "epoch": 0.00505902192242833, "grad_norm": 7.696018374346988, "kl": 0.0007190704345703125, "learning_rate": 9.989881956155142e-07, "loss": 0.0, "reward": 1.3987104892730713, "reward_std": 0.7126790881156921, "rewards/final_reward": 0.13702087713408737, "rewards/mask_iou_reward": 0.06851043856704368, "rewards/sam_format_reward": 0.796875, "rewards/sam_reward_func_ultra": 0.08621050044894218, "rewards/thk_ans_format_reward": 0.515625, "step": 3, "think_completion_length": 90.90625 }, { "clip_ratio": 0.0, "completion_length": 237.375, "epoch": 0.006745362563237774, "grad_norm": 7.557573164502257, "kl": 0.00098419189453125, "learning_rate": 9.986509274873523e-07, "loss": 0.0, "reward": 1.5465224385261536, "reward_std": 0.8511916399002075, "rewards/final_reward": 0.41226695348936837, "rewards/mask_iou_reward": 0.20613347674468419, "rewards/sam_format_reward": 0.8125, "rewards/sam_reward_func_ultra": 0.3121473789215088, "rewards/thk_ans_format_reward": 0.421875, "step": 4, "think_completion_length": 82.125 }, { "clip_ratio": 0.0, "completion_length": 236.0, "epoch": 0.008431703204047217, "grad_norm": 7.6366763281725065, "kl": 0.001277923583984375, "learning_rate": 9.983136593591906e-07, "loss": 0.0, "reward": 2.1739466190338135, "reward_std": 1.1614585518836975, "rewards/final_reward": 0.6282551614601821, "rewards/mask_iou_reward": 0.31412758073009106, "rewards/sam_format_reward": 0.78125, "rewards/sam_reward_func_ultra": 0.7520716190338135, "rewards/thk_ans_format_reward": 0.640625, "step": 5, "think_completion_length": 100.59375 }, { "clip_ratio": 0.0, "completion_length": 224.140625, "epoch": 0.01011804384485666, "grad_norm": 2.912091121635317, "kl": 0.00293731689453125, "learning_rate": 9.979763912310287e-07, "loss": 0.0, "reward": 1.8722057342529297, "reward_std": 0.7133974432945251, "rewards/final_reward": 0.36722210535603916, "rewards/mask_iou_reward": 0.18361105267801958, "rewards/sam_format_reward": 0.84375, "rewards/sam_reward_func_ultra": 0.2784557491540909, "rewards/thk_ans_format_reward": 0.75, "step": 6, "think_completion_length": 110.5625 }, { "clip_ratio": 0.0, "completion_length": 188.640625, "epoch": 0.011804384485666104, "grad_norm": 5.0167749766290965, "kl": 0.0034637451171875, "learning_rate": 9.976391231028668e-07, "loss": 0.0, "reward": 2.0893322825431824, "reward_std": 0.7956610918045044, "rewards/final_reward": 0.23091329445268022, "rewards/mask_iou_reward": 0.11545664722634011, "rewards/sam_format_reward": 0.859375, "rewards/sam_reward_func_ultra": 0.40183228999376297, "rewards/thk_ans_format_reward": 0.828125, "step": 7, "think_completion_length": 92.6875 }, { "clip_ratio": 0.0, "completion_length": 238.109375, "epoch": 0.013490725126475547, "grad_norm": 4.904823389255345, "kl": 0.0058135986328125, "learning_rate": 9.973018549747049e-07, "loss": 0.0, "reward": 1.9315199851989746, "reward_std": 0.7769245803356171, "rewards/final_reward": 0.32775235227855815, "rewards/mask_iou_reward": 0.16387617613927907, "rewards/sam_format_reward": 0.890625, "rewards/sam_reward_func_ultra": 0.3065200597047806, "rewards/thk_ans_format_reward": 0.734375, "step": 8, "think_completion_length": 100.40625 }, { "clip_ratio": 0.0, "completion_length": 207.265625, "epoch": 0.01517706576728499, "grad_norm": 7.32651281879669, "kl": 0.00531005859375, "learning_rate": 9.96964586846543e-07, "loss": 0.0, "reward": 2.4003329277038574, "reward_std": 0.7638080418109894, "rewards/final_reward": 0.6744501600850431, "rewards/mask_iou_reward": 0.33722508004252155, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 0.6659578680992126, "rewards/thk_ans_format_reward": 0.796875, "step": 9, "think_completion_length": 95.65625 }, { "clip_ratio": 0.0, "completion_length": 221.0625, "epoch": 0.016863406408094434, "grad_norm": 2.8369350027224343, "kl": 0.0061187744140625, "learning_rate": 9.96627318718381e-07, "loss": 0.0, "reward": 2.134338617324829, "reward_std": 0.6036363840103149, "rewards/final_reward": 0.19468699022801095, "rewards/mask_iou_reward": 0.09734349511400547, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 0.3218386247754097, "rewards/thk_ans_format_reward": 0.90625, "step": 10, "think_completion_length": 138.59375 }, { "clip_ratio": 0.0, "completion_length": 261.484375, "epoch": 0.01854974704890388, "grad_norm": 4.726322830555546, "kl": 0.00331878662109375, "learning_rate": 9.962900505902191e-07, "loss": 0.0, "reward": 2.158900499343872, "reward_std": 0.5658632814884186, "rewards/final_reward": 0.3395835136590783, "rewards/mask_iou_reward": 0.16979175682953915, "rewards/sam_format_reward": 0.921875, "rewards/sam_reward_func_ultra": 0.29952552914619446, "rewards/thk_ans_format_reward": 0.9375, "step": 11, "think_completion_length": 138.25 }, { "clip_ratio": 0.0, "completion_length": 228.8125, "epoch": 0.02023608768971332, "grad_norm": 5.799680733497899, "kl": 0.005096435546875, "learning_rate": 9.959527824620572e-07, "loss": 0.0, "reward": 2.2421607971191406, "reward_std": 0.545092910528183, "rewards/final_reward": 0.28276694921239226, "rewards/mask_iou_reward": 0.14138347460619613, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 0.320285826921463, "rewards/thk_ans_format_reward": 0.984375, "step": 12, "think_completion_length": 121.90625 }, { "clip_ratio": 0.0, "completion_length": 214.671875, "epoch": 0.021922428330522766, "grad_norm": 10.411892938228622, "kl": 0.0047149658203125, "learning_rate": 9.956155143338955e-07, "loss": 0.0, "reward": 2.1514652967453003, "reward_std": 0.46626925468444824, "rewards/final_reward": 0.10493260642775627, "rewards/mask_iou_reward": 0.052466303213878136, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.2452152743935585, "rewards/thk_ans_format_reward": 0.9375, "step": 13, "think_completion_length": 139.375 }, { "clip_ratio": 0.0, "completion_length": 209.578125, "epoch": 0.023608768971332208, "grad_norm": 3.988672789225242, "kl": 0.0039215087890625, "learning_rate": 9.952782462057336e-07, "loss": 0.0, "reward": 2.428071618080139, "reward_std": 0.4563491642475128, "rewards/final_reward": 0.6773734275020717, "rewards/mask_iou_reward": 0.3386867137510359, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4749467372894287, "rewards/thk_ans_format_reward": 0.953125, "step": 14, "think_completion_length": 119.1875 }, { "clip_ratio": 0.0, "completion_length": 218.109375, "epoch": 0.025295109612141653, "grad_norm": 4.0623051435163635, "kl": 0.0120391845703125, "learning_rate": 9.949409780775717e-07, "loss": 0.0, "reward": 2.3204623460769653, "reward_std": 0.46338681876659393, "rewards/final_reward": 0.25837267105119793, "rewards/mask_iou_reward": 0.12918633552559897, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.36733730882406235, "rewards/thk_ans_format_reward": 0.96875, "step": 15, "think_completion_length": 110.3125 }, { "clip_ratio": 0.0, "completion_length": 193.546875, "epoch": 0.026981450252951095, "grad_norm": 4.023141614046889, "kl": 0.0052947998046875, "learning_rate": 9.946037099494098e-07, "loss": 0.0, "reward": 2.421836733818054, "reward_std": 0.47443249821662903, "rewards/final_reward": 0.7746925003730644, "rewards/mask_iou_reward": 0.3873462501865322, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.48433683812618256, "rewards/thk_ans_format_reward": 0.953125, "step": 16, "think_completion_length": 89.71875 }, { "clip_ratio": 0.0, "completion_length": 212.015625, "epoch": 0.02866779089376054, "grad_norm": 3.8148204524014933, "kl": 0.00437164306640625, "learning_rate": 9.942664418212479e-07, "loss": 0.0, "reward": 2.22139310836792, "reward_std": 0.3197246938943863, "rewards/final_reward": 0.280598368183643, "rewards/mask_iou_reward": 0.1402991840918215, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.2526431120932102, "rewards/thk_ans_format_reward": 1.0, "step": 17, "think_completion_length": 102.34375 }, { "clip_ratio": 0.0, "completion_length": 202.4375, "epoch": 0.03035413153456998, "grad_norm": 6.159701968434453, "kl": 0.0063934326171875, "learning_rate": 9.93929173693086e-07, "loss": 0.0, "reward": 2.3376920223236084, "reward_std": 0.4732118546962738, "rewards/final_reward": 0.41400893951175743, "rewards/mask_iou_reward": 0.20700446975587872, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.35331688821315765, "rewards/thk_ans_format_reward": 1.0, "step": 18, "think_completion_length": 126.84375 }, { "clip_ratio": 0.0, "completion_length": 234.28125, "epoch": 0.03204047217537943, "grad_norm": 7.068604665156501, "kl": 0.00653076171875, "learning_rate": 9.93591905564924e-07, "loss": 0.0, "reward": 2.571447491645813, "reward_std": 0.39502865076065063, "rewards/final_reward": 0.11502420152389195, "rewards/mask_iou_reward": 0.057512100761945975, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.587072491645813, "rewards/thk_ans_format_reward": 1.0, "step": 19, "think_completion_length": 120.75 }, { "clip_ratio": 0.0, "completion_length": 211.09375, "epoch": 0.03372681281618887, "grad_norm": 6.134636624813015, "kl": 0.0063323974609375, "learning_rate": 9.932546374367621e-07, "loss": 0.0, "reward": 2.1835756301879883, "reward_std": 0.2918053865432739, "rewards/final_reward": 0.34969948763247793, "rewards/mask_iou_reward": 0.17484974381623897, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.23045063391327858, "rewards/thk_ans_format_reward": 0.96875, "step": 20, "think_completion_length": 106.65625 }, { "clip_ratio": 0.0, "completion_length": 214.171875, "epoch": 0.03541315345699832, "grad_norm": 5.003282709985643, "kl": 0.0080413818359375, "learning_rate": 9.929173693086002e-07, "loss": 0.0, "reward": 2.1113470792770386, "reward_std": 0.3219098150730133, "rewards/final_reward": 0.2392562255838127, "rewards/mask_iou_reward": 0.11962811279190635, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 0.1582220196723938, "rewards/thk_ans_format_reward": 1.0, "step": 21, "think_completion_length": 104.40625 }, { "clip_ratio": 0.0, "completion_length": 195.703125, "epoch": 0.03709949409780776, "grad_norm": 7.420447224924629, "kl": 0.007659912109375, "learning_rate": 9.925801011804385e-07, "loss": 0.0, "reward": 2.3550353050231934, "reward_std": 0.47632284462451935, "rewards/final_reward": 0.7523633924279491, "rewards/mask_iou_reward": 0.37618169621397457, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 0.41753529757261276, "rewards/thk_ans_format_reward": 0.984375, "step": 22, "think_completion_length": 111.71875 }, { "clip_ratio": 0.0, "completion_length": 200.515625, "epoch": 0.0387858347386172, "grad_norm": 6.934608630709545, "kl": 0.009063720703125, "learning_rate": 9.922428330522766e-07, "loss": 0.0, "reward": 2.567542314529419, "reward_std": 0.31761983036994934, "rewards/final_reward": 0.7383073887884706, "rewards/mask_iou_reward": 0.3691536943942353, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.5831672549247742, "rewards/thk_ans_format_reward": 1.0, "step": 23, "think_completion_length": 100.75 }, { "clip_ratio": 0.0, "completion_length": 211.3125, "epoch": 0.04047217537942664, "grad_norm": 9.343990713381167, "kl": 0.009674072265625, "learning_rate": 9.919055649241147e-07, "loss": 0.0, "reward": 2.570692539215088, "reward_std": 0.40113507211208344, "rewards/final_reward": 0.06680689264378081, "rewards/mask_iou_reward": 0.033403446321890407, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.5863174498081207, "rewards/thk_ans_format_reward": 1.0, "step": 24, "think_completion_length": 115.375 }, { "clip_ratio": 0.0, "completion_length": 180.546875, "epoch": 0.04215851602023609, "grad_norm": 11.581955402368543, "kl": 0.011810302734375, "learning_rate": 9.915682967959528e-07, "loss": 0.0, "reward": 2.5906275510787964, "reward_std": 0.4756350666284561, "rewards/final_reward": 0.4344880669538129, "rewards/mask_iou_reward": 0.21724403347690646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5906275063753128, "rewards/thk_ans_format_reward": 1.0, "step": 25, "think_completion_length": 87.8125 }, { "clip_ratio": 0.0, "completion_length": 180.734375, "epoch": 0.04384485666104553, "grad_norm": 3.7116820186026, "kl": 0.0115966796875, "learning_rate": 9.912310286677909e-07, "loss": 0.0, "reward": 2.8216378688812256, "reward_std": 0.6313284933567047, "rewards/final_reward": 0.9093900982781415, "rewards/mask_iou_reward": 0.4546950491390708, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8528877198696136, "rewards/thk_ans_format_reward": 0.984375, "step": 26, "think_completion_length": 83.84375 }, { "clip_ratio": 0.0, "completion_length": 204.421875, "epoch": 0.045531197301854974, "grad_norm": 9.778107173702582, "kl": 0.0108642578125, "learning_rate": 9.90893760539629e-07, "loss": 0.0, "reward": 2.408462643623352, "reward_std": 0.33236178010702133, "rewards/final_reward": 0.34480383441485996, "rewards/mask_iou_reward": 0.17240191720742998, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.4240875542163849, "rewards/thk_ans_format_reward": 1.0, "step": 27, "think_completion_length": 105.09375 }, { "clip_ratio": 0.0, "completion_length": 213.140625, "epoch": 0.047217537942664416, "grad_norm": 3.439777122996586, "kl": 0.0130615234375, "learning_rate": 9.90556492411467e-07, "loss": 0.0, "reward": 2.4471945762634277, "reward_std": 0.60556361079216, "rewards/final_reward": 0.07983588152777385, "rewards/mask_iou_reward": 0.039917940763886925, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 0.5409445911645889, "rewards/thk_ans_format_reward": 0.953125, "step": 28, "think_completion_length": 82.8125 }, { "clip_ratio": 0.0, "completion_length": 192.359375, "epoch": 0.048903878583473864, "grad_norm": 5.727453114810501, "kl": 0.01263427734375, "learning_rate": 9.902192242833051e-07, "loss": 0.0, "reward": 2.2731913328170776, "reward_std": 0.3333955407142639, "rewards/final_reward": 0.17951628004589953, "rewards/mask_iou_reward": 0.08975814002294977, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.3044413551688194, "rewards/thk_ans_format_reward": 0.984375, "step": 29, "think_completion_length": 81.3125 }, { "clip_ratio": 0.0, "completion_length": 193.203125, "epoch": 0.050590219224283306, "grad_norm": 3.5572722758780215, "kl": 0.01251220703125, "learning_rate": 9.898819561551432e-07, "loss": 0.0, "reward": 2.581373453140259, "reward_std": 0.4498617798089981, "rewards/final_reward": 0.4765882465393859, "rewards/mask_iou_reward": 0.23829412326969296, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.5969983041286469, "rewards/thk_ans_format_reward": 1.0, "step": 30, "think_completion_length": 110.875 }, { "clip_ratio": 0.0, "completion_length": 168.953125, "epoch": 0.05227655986509275, "grad_norm": 6.201949505321076, "kl": 0.013153076171875, "learning_rate": 9.895446880269815e-07, "loss": 0.0, "reward": 2.519645571708679, "reward_std": 0.5184344947338104, "rewards/final_reward": 0.20871607362538738, "rewards/mask_iou_reward": 0.10435803681269369, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.5508955717086792, "rewards/thk_ans_format_reward": 0.984375, "step": 31, "think_completion_length": 74.65625 }, { "clip_ratio": 0.0, "completion_length": 181.734375, "epoch": 0.05396290050590219, "grad_norm": 3.1357531664931564, "kl": 0.014801025390625, "learning_rate": 9.892074198988196e-07, "loss": 0.0, "reward": 2.109021544456482, "reward_std": 0.2812964990735054, "rewards/final_reward": 0.17978053567590996, "rewards/mask_iou_reward": 0.08989026783795498, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.12464653328061104, "rewards/thk_ans_format_reward": 1.0, "step": 32, "think_completion_length": 103.6875 }, { "clip_ratio": 0.0, "completion_length": 179.203125, "epoch": 0.05564924114671164, "grad_norm": 7.359110139858053, "kl": 0.013458251953125, "learning_rate": 9.888701517706575e-07, "loss": 0.0, "reward": 3.050374388694763, "reward_std": 0.5476076006889343, "rewards/final_reward": 1.2483584376076524, "rewards/mask_iou_reward": 0.6241792188038262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.050374448299408, "rewards/thk_ans_format_reward": 1.0, "step": 33, "think_completion_length": 85.71875 }, { "clip_ratio": 0.0, "completion_length": 180.390625, "epoch": 0.05733558178752108, "grad_norm": 3.3492929495520656, "kl": 0.01727294921875, "learning_rate": 9.885328836424958e-07, "loss": 0.0, "reward": 2.332158923149109, "reward_std": 0.48627421259880066, "rewards/final_reward": 0.2444915259100053, "rewards/mask_iou_reward": 0.12224576295500265, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.3477838523685932, "rewards/thk_ans_format_reward": 0.984375, "step": 34, "think_completion_length": 92.5 }, { "clip_ratio": 0.0, "completion_length": 187.859375, "epoch": 0.05902192242833052, "grad_norm": 5.274814211888088, "kl": 0.01641845703125, "learning_rate": 9.881956155143339e-07, "loss": 0.0, "reward": 2.788159489631653, "reward_std": 0.47747406363487244, "rewards/final_reward": 0.6692964275570692, "rewards/mask_iou_reward": 0.3346482137785346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7881594300270081, "rewards/thk_ans_format_reward": 1.0, "step": 35, "think_completion_length": 101.5 }, { "clip_ratio": 0.0, "completion_length": 178.4375, "epoch": 0.06070826306913996, "grad_norm": 3.8878235168317232, "kl": 0.01806640625, "learning_rate": 9.87858347386172e-07, "loss": 0.0, "reward": 2.483630895614624, "reward_std": 0.35369937121868134, "rewards/final_reward": 0.6384590901960252, "rewards/mask_iou_reward": 0.3192295450980126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.49925583600997925, "rewards/thk_ans_format_reward": 0.984375, "step": 36, "think_completion_length": 94.71875 }, { "clip_ratio": 0.0, "completion_length": 172.515625, "epoch": 0.06239460370994941, "grad_norm": 4.710577774961891, "kl": 0.018310546875, "learning_rate": 9.8752107925801e-07, "loss": 0.0, "reward": 2.351213812828064, "reward_std": 0.43042635917663574, "rewards/final_reward": 0.5359964884167394, "rewards/mask_iou_reward": 0.2679982442083697, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.3980888221412897, "rewards/thk_ans_format_reward": 0.984375, "step": 37, "think_completion_length": 91.34375 }, { "clip_ratio": 0.0, "completion_length": 186.640625, "epoch": 0.06408094435075885, "grad_norm": 6.566714388018845, "kl": 0.019775390625, "learning_rate": 9.871838111298481e-07, "loss": 0.0, "reward": 2.7930028438568115, "reward_std": 0.4626040458679199, "rewards/final_reward": 1.280001967353977, "rewards/mask_iou_reward": 0.6400009836769885, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7930029332637787, "rewards/thk_ans_format_reward": 1.0, "step": 38, "think_completion_length": 114.4375 }, { "clip_ratio": 0.0, "completion_length": 154.03125, "epoch": 0.0657672849915683, "grad_norm": 4.749445484773544, "kl": 0.0218505859375, "learning_rate": 9.868465430016864e-07, "loss": 0.0, "reward": 2.3910492658615112, "reward_std": 0.4347519278526306, "rewards/final_reward": 0.5976855387780159, "rewards/mask_iou_reward": 0.29884276938900795, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.40667423605918884, "rewards/thk_ans_format_reward": 1.0, "step": 39, "think_completion_length": 61.8125 }, { "clip_ratio": 0.0, "completion_length": 161.03125, "epoch": 0.06745362563237774, "grad_norm": 6.983534877629888, "kl": 0.0255126953125, "learning_rate": 9.865092748735245e-07, "loss": 0.0, "reward": 2.32186222076416, "reward_std": 0.32901330292224884, "rewards/final_reward": 0.27268833919659746, "rewards/mask_iou_reward": 0.13634416959829873, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.32186219096183777, "rewards/thk_ans_format_reward": 1.0, "step": 40, "think_completion_length": 74.375 }, { "clip_ratio": 0.0, "completion_length": 167.0625, "epoch": 0.06913996627318718, "grad_norm": 2.6419405467908934, "kl": 0.0234375, "learning_rate": 9.861720067453626e-07, "loss": 0.0, "reward": 2.150454521179199, "reward_std": 0.35950616002082825, "rewards/final_reward": 0.18317714975195007, "rewards/mask_iou_reward": 0.09158857487597503, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.18170449137687683, "rewards/thk_ans_format_reward": 0.984375, "step": 41, "think_completion_length": 64.28125 }, { "clip_ratio": 0.0, "completion_length": 174.609375, "epoch": 0.07082630691399663, "grad_norm": 4.05208078269746, "kl": 0.0238037109375, "learning_rate": 9.858347386172007e-07, "loss": 0.0, "reward": 2.6930134296417236, "reward_std": 0.4153265655040741, "rewards/final_reward": 0.7187156032058426, "rewards/mask_iou_reward": 0.3593578016029213, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6930133700370789, "rewards/thk_ans_format_reward": 1.0, "step": 42, "think_completion_length": 87.25 }, { "clip_ratio": 0.0, "completion_length": 154.265625, "epoch": 0.07251264755480608, "grad_norm": 8.775407716744624, "kl": 0.0306396484375, "learning_rate": 9.854974704890388e-07, "loss": 0.0, "reward": 2.2557884454727173, "reward_std": 0.2892530858516693, "rewards/final_reward": 0.29538722660033423, "rewards/mask_iou_reward": 0.14769361330016711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.25578849017620087, "rewards/thk_ans_format_reward": 1.0, "step": 43, "think_completion_length": 64.3125 }, { "clip_ratio": 0.0, "completion_length": 166.234375, "epoch": 0.07419898819561552, "grad_norm": 4.092938456973902, "kl": 0.025390625, "learning_rate": 9.851602023608769e-07, "loss": 0.0, "reward": 2.231912851333618, "reward_std": 0.25391124188899994, "rewards/final_reward": 0.1966431877886882, "rewards/mask_iou_reward": 0.0983215938943441, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.23191292583942413, "rewards/thk_ans_format_reward": 1.0, "step": 44, "think_completion_length": 76.375 }, { "clip_ratio": 0.0, "completion_length": 165.8125, "epoch": 0.07588532883642496, "grad_norm": 3.9861339883345415, "kl": 0.02789306640625, "learning_rate": 9.84822934232715e-07, "loss": 0.0, "reward": 2.4915008544921875, "reward_std": 0.3098950535058975, "rewards/final_reward": 0.48250574585781697, "rewards/mask_iou_reward": 0.24125287292890849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4915008842945099, "rewards/thk_ans_format_reward": 1.0, "step": 45, "think_completion_length": 81.53125 }, { "clip_ratio": 0.0, "completion_length": 162.671875, "epoch": 0.0775716694772344, "grad_norm": 5.193780871736565, "kl": 0.02783203125, "learning_rate": 9.84485666104553e-07, "loss": 0.0, "reward": 2.3561939001083374, "reward_std": 0.48239606618881226, "rewards/final_reward": 0.3691018382312205, "rewards/mask_iou_reward": 0.18455091911561025, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.356193870306015, "rewards/thk_ans_format_reward": 1.0, "step": 46, "think_completion_length": 71.0625 }, { "clip_ratio": 0.0, "completion_length": 157.265625, "epoch": 0.07925801011804384, "grad_norm": 15.234317760475385, "kl": 0.0238037109375, "learning_rate": 9.841483979763911e-07, "loss": 0.0, "reward": 2.8241143226623535, "reward_std": 0.5089195966720581, "rewards/final_reward": 0.7781608077357804, "rewards/mask_iou_reward": 0.3890804038678902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.824114203453064, "rewards/thk_ans_format_reward": 1.0, "step": 47, "think_completion_length": 77.125 }, { "clip_ratio": 0.0, "completion_length": 170.671875, "epoch": 0.08094435075885328, "grad_norm": 5.321848456796247, "kl": 0.02398681640625, "learning_rate": 9.838111298482294e-07, "loss": 0.0, "reward": 2.518641948699951, "reward_std": 0.6300854980945587, "rewards/final_reward": 0.25648786787986366, "rewards/mask_iou_reward": 0.12824393393993183, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.549891784787178, "rewards/thk_ans_format_reward": 0.984375, "step": 48, "think_completion_length": 96.84375 }, { "clip_ratio": 0.0, "completion_length": 157.8125, "epoch": 0.08263069139966273, "grad_norm": 4.2051402144970185, "kl": 0.0277099609375, "learning_rate": 9.834738617200675e-07, "loss": 0.0, "reward": 2.3136563301086426, "reward_std": 0.33684292435646057, "rewards/final_reward": 0.25782320440538586, "rewards/mask_iou_reward": 0.12891160220269293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.31365638226270676, "rewards/thk_ans_format_reward": 1.0, "step": 49, "think_completion_length": 80.1875 }, { "clip_ratio": 0.0, "completion_length": 178.09375, "epoch": 0.08431703204047218, "grad_norm": 3.5448275921745855, "kl": 0.03167724609375, "learning_rate": 9.831365935919054e-07, "loss": 0.0, "reward": 2.624325752258301, "reward_std": 0.5980704128742218, "rewards/final_reward": 0.7610812306965711, "rewards/mask_iou_reward": 0.38054061534828554, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.6712007820606232, "rewards/thk_ans_format_reward": 0.984375, "step": 50, "think_completion_length": 94.75 }, { "clip_ratio": 0.0, "completion_length": 172.625, "epoch": 0.08600337268128162, "grad_norm": 13.710511715854196, "kl": 0.02752685546875, "learning_rate": 9.827993254637437e-07, "loss": 0.0, "reward": 2.9260172843933105, "reward_std": 0.5077286660671234, "rewards/final_reward": 0.6013949454238717, "rewards/mask_iou_reward": 0.30069747271193586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.926017165184021, "rewards/thk_ans_format_reward": 1.0, "step": 51, "think_completion_length": 89.0 }, { "clip_ratio": 0.0, "completion_length": 155.140625, "epoch": 0.08768971332209106, "grad_norm": 5.965861687588688, "kl": 0.0281982421875, "learning_rate": 9.824620573355818e-07, "loss": 0.0, "reward": 2.553568720817566, "reward_std": 0.39463698863983154, "rewards/final_reward": 0.7572741895663272, "rewards/mask_iou_reward": 0.3786370947831636, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.5848187357187271, "rewards/thk_ans_format_reward": 1.0, "step": 52, "think_completion_length": 80.5625 }, { "clip_ratio": 0.0, "completion_length": 168.265625, "epoch": 0.0893760539629005, "grad_norm": 5.2272156753878685, "kl": 0.037109375, "learning_rate": 9.821247892074199e-07, "loss": 0.0, "reward": 2.8453149795532227, "reward_std": 0.3831482380628586, "rewards/final_reward": 1.032414764875731, "rewards/mask_iou_reward": 0.5162073824378655, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8609398603439331, "rewards/thk_ans_format_reward": 1.0, "step": 53, "think_completion_length": 82.78125 }, { "clip_ratio": 0.0, "completion_length": 155.96875, "epoch": 0.09106239460370995, "grad_norm": 18.25648493841559, "kl": 0.041748046875, "learning_rate": 9.81787521079258e-07, "loss": 0.0, "reward": 2.4916036128997803, "reward_std": 0.5656653642654419, "rewards/final_reward": 0.4259493768200442, "rewards/mask_iou_reward": 0.2129746884100221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.49160344898700714, "rewards/thk_ans_format_reward": 1.0, "step": 54, "think_completion_length": 76.125 }, { "clip_ratio": 0.0, "completion_length": 184.0, "epoch": 0.09274873524451939, "grad_norm": 4.309489924245263, "kl": 0.0341796875, "learning_rate": 9.81450252951096e-07, "loss": 0.0, "reward": 2.443922281265259, "reward_std": 0.398771733045578, "rewards/final_reward": 0.3771699374711056, "rewards/mask_iou_reward": 0.1885849687355528, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.4595472365617752, "rewards/thk_ans_format_reward": 1.0, "step": 55, "think_completion_length": 96.40625 }, { "clip_ratio": 0.0, "completion_length": 157.0625, "epoch": 0.09443507588532883, "grad_norm": 6.056181466884099, "kl": 0.047119140625, "learning_rate": 9.811129848229341e-07, "loss": 0.0001, "reward": 2.8348472118377686, "reward_std": 0.48723451793193817, "rewards/final_reward": 0.5843060876407499, "rewards/mask_iou_reward": 0.29215304382037494, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8348471522331238, "rewards/thk_ans_format_reward": 1.0, "step": 56, "think_completion_length": 79.65625 }, { "clip_ratio": 0.0, "completion_length": 153.890625, "epoch": 0.09612141652613827, "grad_norm": 4.78709575873903, "kl": 0.031494140625, "learning_rate": 9.807757166947724e-07, "loss": 0.0, "reward": 2.881469249725342, "reward_std": 0.2315191924571991, "rewards/final_reward": 0.8960268816351212, "rewards/mask_iou_reward": 0.4480134408175606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.881469264626503, "rewards/thk_ans_format_reward": 1.0, "step": 57, "think_completion_length": 97.0625 }, { "clip_ratio": 0.0, "completion_length": 154.9375, "epoch": 0.09780775716694773, "grad_norm": 4.489917300832141, "kl": 0.041015625, "learning_rate": 9.804384485666103e-07, "loss": 0.0, "reward": 2.621418833732605, "reward_std": 0.3816119581460953, "rewards/final_reward": 0.3244678704006397, "rewards/mask_iou_reward": 0.16223393520031984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6370438188314438, "rewards/thk_ans_format_reward": 0.984375, "step": 58, "think_completion_length": 77.4375 }, { "clip_ratio": 0.0, "completion_length": 152.421875, "epoch": 0.09949409780775717, "grad_norm": 30.73746270981293, "kl": 0.9716796875, "learning_rate": 9.801011804384484e-07, "loss": 0.001, "reward": 2.356270670890808, "reward_std": 0.32067833840847015, "rewards/final_reward": 0.23782315432838275, "rewards/mask_iou_reward": 0.11891157716419137, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.3875206280499697, "rewards/thk_ans_format_reward": 0.984375, "step": 59, "think_completion_length": 81.96875 }, { "clip_ratio": 0.0, "completion_length": 159.859375, "epoch": 0.10118043844856661, "grad_norm": 7.300470439382993, "kl": 0.0509033203125, "learning_rate": 9.797639123102867e-07, "loss": 0.0001, "reward": 2.279364824295044, "reward_std": 0.3495059013366699, "rewards/final_reward": 0.3413407541577943, "rewards/mask_iou_reward": 0.17067037707889715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.2793649360537529, "rewards/thk_ans_format_reward": 1.0, "step": 60, "think_completion_length": 82.90625 }, { "clip_ratio": 0.0, "completion_length": 155.375, "epoch": 0.10286677908937605, "grad_norm": 3.6231139230653238, "kl": 0.048095703125, "learning_rate": 9.794266441821248e-07, "loss": 0.0, "reward": 2.9439969062805176, "reward_std": 0.44367513060569763, "rewards/final_reward": 1.0699706439093257, "rewards/mask_iou_reward": 0.5349853219546629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9439970254898071, "rewards/thk_ans_format_reward": 1.0, "step": 61, "think_completion_length": 74.125 }, { "clip_ratio": 0.0, "completion_length": 152.140625, "epoch": 0.1045531197301855, "grad_norm": 4.731110310007667, "kl": 0.041748046875, "learning_rate": 9.790893760539629e-07, "loss": 0.0, "reward": 2.3657991886138916, "reward_std": 0.3336651027202606, "rewards/final_reward": 0.23778256411692913, "rewards/mask_iou_reward": 0.11889128205846457, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.38142427057027817, "rewards/thk_ans_format_reward": 1.0, "step": 62, "think_completion_length": 69.0 }, { "clip_ratio": 0.0, "completion_length": 138.734375, "epoch": 0.10623946037099494, "grad_norm": 4.332128989884373, "kl": 0.045166015625, "learning_rate": 9.78752107925801e-07, "loss": 0.0, "reward": 2.584197163581848, "reward_std": 0.36240070313215256, "rewards/final_reward": 1.0086378407320715, "rewards/mask_iou_reward": 0.5043189203660358, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5841971933841705, "rewards/thk_ans_format_reward": 1.0, "step": 63, "think_completion_length": 66.125 }, { "clip_ratio": 0.0, "completion_length": 134.375, "epoch": 0.10792580101180438, "grad_norm": 6.430450000313298, "kl": 0.0435791015625, "learning_rate": 9.78414839797639e-07, "loss": 0.0, "reward": 2.966284394264221, "reward_std": 0.44423648715019226, "rewards/final_reward": 0.9652030991016378, "rewards/mask_iou_reward": 0.4826015495508189, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9662843346595764, "rewards/thk_ans_format_reward": 1.0, "step": 64, "think_completion_length": 70.8125 }, { "clip_ratio": 0.0, "completion_length": 138.75, "epoch": 0.10961214165261383, "grad_norm": 6.607035457021729, "kl": 0.0462646484375, "learning_rate": 9.780775716694773e-07, "loss": 0.0, "reward": 2.5945472717285156, "reward_std": 0.43930642306804657, "rewards/final_reward": 0.5589171176406154, "rewards/mask_iou_reward": 0.2794585588203077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5945473164319992, "rewards/thk_ans_format_reward": 1.0, "step": 65, "think_completion_length": 68.15625 }, { "clip_ratio": 0.0, "completion_length": 157.09375, "epoch": 0.11129848229342328, "grad_norm": 7.237969962856227, "kl": 0.0462646484375, "learning_rate": 9.777403035413154e-07, "loss": 0.0, "reward": 2.5197932720184326, "reward_std": 0.361030712723732, "rewards/final_reward": 0.8137916503818017, "rewards/mask_iou_reward": 0.40689582519090084, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5197932720184326, "rewards/thk_ans_format_reward": 1.0, "step": 66, "think_completion_length": 78.6875 }, { "clip_ratio": 0.0, "completion_length": 162.984375, "epoch": 0.11298482293423272, "grad_norm": 3.667449916530634, "kl": 0.052734375, "learning_rate": 9.774030354131533e-07, "loss": 0.0001, "reward": 2.8902995586395264, "reward_std": 0.3441592901945114, "rewards/final_reward": 0.7054827168834719, "rewards/mask_iou_reward": 0.35274135844173593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8902994990348816, "rewards/thk_ans_format_reward": 1.0, "step": 67, "think_completion_length": 84.46875 }, { "clip_ratio": 0.0, "completion_length": 148.078125, "epoch": 0.11467116357504216, "grad_norm": 4.636281295895278, "kl": 0.05419921875, "learning_rate": 9.770657672849916e-07, "loss": 0.0001, "reward": 3.043671727180481, "reward_std": 0.4341175705194473, "rewards/final_reward": 0.8562785388668822, "rewards/mask_iou_reward": 0.4281392694334411, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0436716675758362, "rewards/thk_ans_format_reward": 1.0, "step": 68, "think_completion_length": 63.09375 }, { "clip_ratio": 0.0, "completion_length": 156.890625, "epoch": 0.1163575042158516, "grad_norm": 6.675883431911974, "kl": 0.05029296875, "learning_rate": 9.767284991568297e-07, "loss": 0.0001, "reward": 2.569810628890991, "reward_std": 0.43355831503868103, "rewards/final_reward": 0.9403461534095212, "rewards/mask_iou_reward": 0.4701730767047606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5698107182979584, "rewards/thk_ans_format_reward": 1.0, "step": 69, "think_completion_length": 81.71875 }, { "clip_ratio": 0.0, "completion_length": 134.046875, "epoch": 0.11804384485666104, "grad_norm": 3.7985544472742885, "kl": 0.05224609375, "learning_rate": 9.763912310286678e-07, "loss": 0.0, "reward": 2.5393481254577637, "reward_std": 0.37574321031570435, "rewards/final_reward": 0.5661758463546982, "rewards/mask_iou_reward": 0.2830879231773491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5393481552600861, "rewards/thk_ans_format_reward": 1.0, "step": 70, "think_completion_length": 60.34375 }, { "clip_ratio": 0.0, "completion_length": 136.03125, "epoch": 0.11973018549747048, "grad_norm": 9.509296755104442, "kl": 0.050048828125, "learning_rate": 9.760539629005059e-07, "loss": 0.0, "reward": 2.6844829320907593, "reward_std": 0.25827430188655853, "rewards/final_reward": 0.9556732039750488, "rewards/mask_iou_reward": 0.4778366019875244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6844830363988876, "rewards/thk_ans_format_reward": 1.0, "step": 71, "think_completion_length": 69.21875 }, { "clip_ratio": 0.0, "completion_length": 137.8125, "epoch": 0.12141652613827993, "grad_norm": 6.190314653353028, "kl": 0.050048828125, "learning_rate": 9.75716694772344e-07, "loss": 0.0001, "reward": 2.965428590774536, "reward_std": 0.4965183287858963, "rewards/final_reward": 1.020839803331826, "rewards/mask_iou_reward": 0.510419901665913, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9654284715652466, "rewards/thk_ans_format_reward": 1.0, "step": 72, "think_completion_length": 73.125 }, { "clip_ratio": 0.0, "completion_length": 142.375, "epoch": 0.12310286677908938, "grad_norm": 3.917708434827429, "kl": 0.055908203125, "learning_rate": 9.75379426644182e-07, "loss": 0.0001, "reward": 2.393458843231201, "reward_std": 0.44205300509929657, "rewards/final_reward": 0.330337392985688, "rewards/mask_iou_reward": 0.165168696492844, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.40908390283584595, "rewards/thk_ans_format_reward": 1.0, "step": 73, "think_completion_length": 73.96875 }, { "clip_ratio": 0.0, "completion_length": 143.640625, "epoch": 0.12478920741989882, "grad_norm": 18.101921577359803, "kl": 0.0517578125, "learning_rate": 9.750421585160203e-07, "loss": 0.0001, "reward": 2.5843217372894287, "reward_std": 0.3954938128590584, "rewards/final_reward": 0.5663095013474037, "rewards/mask_iou_reward": 0.2831547506737018, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5843217074871063, "rewards/thk_ans_format_reward": 1.0, "step": 74, "think_completion_length": 61.21875 }, { "clip_ratio": 0.0, "completion_length": 136.75, "epoch": 0.12647554806070826, "grad_norm": 4.985492957391431, "kl": 0.0654296875, "learning_rate": 9.747048903878582e-07, "loss": 0.0001, "reward": 2.6875481605529785, "reward_std": 0.3493267670273781, "rewards/final_reward": 0.8727965459469798, "rewards/mask_iou_reward": 0.4363982729734899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6875482201576233, "rewards/thk_ans_format_reward": 1.0, "step": 75, "think_completion_length": 63.21875 }, { "clip_ratio": 0.0, "completion_length": 142.203125, "epoch": 0.1281618887015177, "grad_norm": 4.3007506207368875, "kl": 0.0550537109375, "learning_rate": 9.743676222596963e-07, "loss": 0.0001, "reward": 2.5128692388534546, "reward_std": 0.2817462384700775, "rewards/final_reward": 0.7190053530698697, "rewards/mask_iou_reward": 0.35950267653493484, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5128692984580994, "rewards/thk_ans_format_reward": 1.0, "step": 76, "think_completion_length": 65.53125 }, { "clip_ratio": 0.0, "completion_length": 144.0, "epoch": 0.12984822934232715, "grad_norm": 4.301075928648367, "kl": 0.0494384765625, "learning_rate": 9.740303541315346e-07, "loss": 0.0, "reward": 2.524822235107422, "reward_std": 0.5233335793018341, "rewards/final_reward": 0.18636864728108127, "rewards/mask_iou_reward": 0.09318432364054063, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.5560723841190338, "rewards/thk_ans_format_reward": 1.0, "step": 77, "think_completion_length": 66.5625 }, { "clip_ratio": 0.0, "completion_length": 142.28125, "epoch": 0.1315345699831366, "grad_norm": 8.000478470083442, "kl": 0.0528564453125, "learning_rate": 9.736930860033727e-07, "loss": 0.0001, "reward": 2.4821194410324097, "reward_std": 0.36987268924713135, "rewards/final_reward": 0.38024887171918265, "rewards/mask_iou_reward": 0.19012443585959132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4821194261312485, "rewards/thk_ans_format_reward": 1.0, "step": 78, "think_completion_length": 78.8125 }, { "clip_ratio": 0.0, "completion_length": 146.84375, "epoch": 0.13322091062394603, "grad_norm": 13.794375657986603, "kl": 0.0489501953125, "learning_rate": 9.733558178752108e-07, "loss": 0.0, "reward": 2.5945109128952026, "reward_std": 0.49958792328834534, "rewards/final_reward": 0.562274788805415, "rewards/mask_iou_reward": 0.2811373944027075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5945108532905579, "rewards/thk_ans_format_reward": 1.0, "step": 79, "think_completion_length": 70.5 }, { "clip_ratio": 0.0, "completion_length": 146.40625, "epoch": 0.13490725126475547, "grad_norm": 9.463013735013696, "kl": 0.052734375, "learning_rate": 9.730185497470489e-07, "loss": 0.0001, "reward": 3.1495691537857056, "reward_std": 0.3907051086425781, "rewards/final_reward": 0.943087603650786, "rewards/mask_iou_reward": 0.471543801825393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1495689749717712, "rewards/thk_ans_format_reward": 1.0, "step": 80, "think_completion_length": 65.53125 }, { "clip_ratio": 0.0, "completion_length": 145.375, "epoch": 0.13659359190556492, "grad_norm": 7.983156067058834, "kl": 0.0528564453125, "learning_rate": 9.72681281618887e-07, "loss": 0.0001, "reward": 2.804241895675659, "reward_std": 0.5239145308732986, "rewards/final_reward": 0.5999701674591078, "rewards/mask_iou_reward": 0.2999850837295539, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8042419850826263, "rewards/thk_ans_format_reward": 1.0, "step": 81, "think_completion_length": 70.4375 }, { "clip_ratio": 0.0, "completion_length": 133.203125, "epoch": 0.13827993254637436, "grad_norm": 4.445323229695713, "kl": 0.074951171875, "learning_rate": 9.72344013490725e-07, "loss": 0.0001, "reward": 3.071893572807312, "reward_std": 0.44676540791988373, "rewards/final_reward": 1.342211603129101, "rewards/mask_iou_reward": 0.6711058015645505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.071893572807312, "rewards/thk_ans_format_reward": 1.0, "step": 82, "think_completion_length": 69.875 }, { "clip_ratio": 0.0, "completion_length": 168.46875, "epoch": 0.1399662731871838, "grad_norm": 14.40443893510557, "kl": 0.0650634765625, "learning_rate": 9.720067453625631e-07, "loss": 0.0001, "reward": 2.468700408935547, "reward_std": 0.41579216718673706, "rewards/final_reward": 0.4535352431406775, "rewards/mask_iou_reward": 0.22676762157033875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4687004014849663, "rewards/thk_ans_format_reward": 1.0, "step": 83, "think_completion_length": 79.0 }, { "clip_ratio": 0.0, "completion_length": 150.546875, "epoch": 0.14165261382799327, "grad_norm": 4.915086814801756, "kl": 0.060791015625, "learning_rate": 9.716694772344012e-07, "loss": 0.0001, "reward": 2.4039013385772705, "reward_std": 0.25321827083826065, "rewards/final_reward": 0.24754253484511543, "rewards/mask_iou_reward": 0.12377126742255772, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.40390123426914215, "rewards/thk_ans_format_reward": 1.0, "step": 84, "think_completion_length": 64.8125 }, { "clip_ratio": 0.0, "completion_length": 126.421875, "epoch": 0.1433389544688027, "grad_norm": 5.806750656488714, "kl": 0.060302734375, "learning_rate": 9.713322091062393e-07, "loss": 0.0001, "reward": 3.2514740228652954, "reward_std": 0.42037880420684814, "rewards/final_reward": 1.3206563347399518, "rewards/mask_iou_reward": 0.6603281673699759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2514739036560059, "rewards/thk_ans_format_reward": 1.0, "step": 85, "think_completion_length": 57.25 }, { "clip_ratio": 0.0, "completion_length": 134.8125, "epoch": 0.14502529510961215, "grad_norm": 12.274112515128795, "kl": 0.0693359375, "learning_rate": 9.709949409780776e-07, "loss": 0.0001, "reward": 2.481606125831604, "reward_std": 0.29363201558589935, "rewards/final_reward": 0.30051989067811424, "rewards/mask_iou_reward": 0.15025994533905712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4816061407327652, "rewards/thk_ans_format_reward": 1.0, "step": 86, "think_completion_length": 64.5625 }, { "clip_ratio": 0.0, "completion_length": 146.65625, "epoch": 0.1467116357504216, "grad_norm": 3.7263438785125964, "kl": 0.052490234375, "learning_rate": 9.706576728499157e-07, "loss": 0.0001, "reward": 2.79649555683136, "reward_std": 0.3482329323887825, "rewards/final_reward": 1.0713907295997764, "rewards/mask_iou_reward": 0.5356953647998882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8121205568313599, "rewards/thk_ans_format_reward": 0.984375, "step": 87, "think_completion_length": 57.78125 }, { "clip_ratio": 0.0, "completion_length": 132.375, "epoch": 0.14839797639123103, "grad_norm": 142.75268382653746, "kl": 0.061767578125, "learning_rate": 9.703204047217538e-07, "loss": 0.0001, "reward": 3.3002171516418457, "reward_std": 0.3195580244064331, "rewards/final_reward": 0.9610996100313889, "rewards/mask_iou_reward": 0.48054980501569444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3002171516418457, "rewards/thk_ans_format_reward": 1.0, "step": 88, "think_completion_length": 63.6875 }, { "clip_ratio": 0.0, "completion_length": 131.453125, "epoch": 0.15008431703204048, "grad_norm": 6.2755111526202905, "kl": 0.08056640625, "learning_rate": 9.699831365935918e-07, "loss": 0.0001, "reward": 2.781501293182373, "reward_std": 0.49102330207824707, "rewards/final_reward": 0.7359130373387389, "rewards/mask_iou_reward": 0.36795651866936946, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.781501293182373, "rewards/thk_ans_format_reward": 1.0, "step": 89, "think_completion_length": 61.8125 }, { "clip_ratio": 0.0, "completion_length": 143.203125, "epoch": 0.15177065767284992, "grad_norm": 4.883317094191715, "kl": 0.06640625, "learning_rate": 9.6964586846543e-07, "loss": 0.0001, "reward": 2.446820020675659, "reward_std": 0.4249372184276581, "rewards/final_reward": 0.5249717239067151, "rewards/mask_iou_reward": 0.26248586195335755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.44682009518146515, "rewards/thk_ans_format_reward": 1.0, "step": 90, "think_completion_length": 70.28125 }, { "clip_ratio": 0.0, "completion_length": 132.09375, "epoch": 0.15345699831365936, "grad_norm": 4.402591818341303, "kl": 0.0546875, "learning_rate": 9.693086003372682e-07, "loss": 0.0001, "reward": 2.751936435699463, "reward_std": 0.2803105264902115, "rewards/final_reward": 0.7289582917623812, "rewards/mask_iou_reward": 0.3644791458811906, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7519364431500435, "rewards/thk_ans_format_reward": 1.0, "step": 91, "think_completion_length": 68.21875 }, { "clip_ratio": 0.0, "completion_length": 151.421875, "epoch": 0.1551433389544688, "grad_norm": 4.023932805501269, "kl": 0.060791015625, "learning_rate": 9.689713322091061e-07, "loss": 0.0001, "reward": 2.698573350906372, "reward_std": 0.5618922114372253, "rewards/final_reward": 0.8791768168432108, "rewards/mask_iou_reward": 0.4395884084216054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6985732316970825, "rewards/thk_ans_format_reward": 1.0, "step": 92, "think_completion_length": 72.625 }, { "clip_ratio": 0.0, "completion_length": 142.015625, "epoch": 0.15682967959527824, "grad_norm": 4.913876907058581, "kl": 0.076171875, "learning_rate": 9.686340640809442e-07, "loss": 0.0001, "reward": 2.4915287494659424, "reward_std": 0.267090268433094, "rewards/final_reward": 0.8385419956098626, "rewards/mask_iou_reward": 0.4192709978049313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4915286898612976, "rewards/thk_ans_format_reward": 1.0, "step": 93, "think_completion_length": 68.40625 }, { "clip_ratio": 0.0, "completion_length": 143.03125, "epoch": 0.15851602023608768, "grad_norm": 4.936450902115474, "kl": 0.06591796875, "learning_rate": 9.682967959527825e-07, "loss": 0.0001, "reward": 2.5161983966827393, "reward_std": 0.43957073986530304, "rewards/final_reward": 0.8536540262243562, "rewards/mask_iou_reward": 0.4268270131121781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5161983147263527, "rewards/thk_ans_format_reward": 1.0, "step": 94, "think_completion_length": 69.5625 }, { "clip_ratio": 0.0, "completion_length": 159.734375, "epoch": 0.16020236087689713, "grad_norm": 10.081914568218277, "kl": 0.0517578125, "learning_rate": 9.679595278246206e-07, "loss": 0.0001, "reward": 2.6539634466171265, "reward_std": 0.3619793802499771, "rewards/final_reward": 0.5307201105892436, "rewards/mask_iou_reward": 0.2653600552946218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6539634168148041, "rewards/thk_ans_format_reward": 1.0, "step": 95, "think_completion_length": 78.8125 }, { "clip_ratio": 0.0, "completion_length": 135.796875, "epoch": 0.16188870151770657, "grad_norm": 3.9564086843292694, "kl": 0.0625, "learning_rate": 9.676222596964587e-07, "loss": 0.0001, "reward": 2.936839461326599, "reward_std": 0.43852272629737854, "rewards/final_reward": 1.3607585043371786, "rewards/mask_iou_reward": 0.6803792521685893, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.952464371919632, "rewards/thk_ans_format_reward": 0.984375, "step": 96, "think_completion_length": 57.3125 }, { "clip_ratio": 0.0, "completion_length": 135.25, "epoch": 0.163575042158516, "grad_norm": 4.524169586310404, "kl": 0.06884765625, "learning_rate": 9.672849915682968e-07, "loss": 0.0001, "reward": 2.5702874660491943, "reward_std": 0.33369340747594833, "rewards/final_reward": 0.17149679590443948, "rewards/mask_iou_reward": 0.08574839795221974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5702873766422272, "rewards/thk_ans_format_reward": 1.0, "step": 97, "think_completion_length": 63.1875 }, { "clip_ratio": 0.0, "completion_length": 151.28125, "epoch": 0.16526138279932545, "grad_norm": 4.8851269262597485, "kl": 0.059326171875, "learning_rate": 9.669477234401348e-07, "loss": 0.0001, "reward": 2.660242795944214, "reward_std": 0.42426833510398865, "rewards/final_reward": 1.1212785651295316, "rewards/mask_iou_reward": 0.5606392825647658, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6758678257465363, "rewards/thk_ans_format_reward": 0.984375, "step": 98, "think_completion_length": 70.15625 }, { "clip_ratio": 0.0, "completion_length": 137.625, "epoch": 0.16694772344013492, "grad_norm": 6.948977823962185, "kl": 0.059326171875, "learning_rate": 9.66610455311973e-07, "loss": 0.0001, "reward": 2.9173457622528076, "reward_std": 0.3428076356649399, "rewards/final_reward": 0.24583723180278247, "rewards/mask_iou_reward": 0.12291861590139123, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9173457622528076, "rewards/thk_ans_format_reward": 1.0, "step": 99, "think_completion_length": 78.34375 }, { "clip_ratio": 0.0, "completion_length": 139.71875, "epoch": 0.16863406408094436, "grad_norm": 7.004314858753663, "kl": 0.076171875, "learning_rate": 9.66273187183811e-07, "loss": 0.0001, "reward": 2.7303144931793213, "reward_std": 0.3506556749343872, "rewards/final_reward": 0.8585279743266268, "rewards/mask_iou_reward": 0.4292639871633134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7303146123886108, "rewards/thk_ans_format_reward": 1.0, "step": 100, "think_completion_length": 66.375 }, { "clip_ratio": 0.0, "completion_length": 144.484375, "epoch": 0.1703204047217538, "grad_norm": 9.72836401997028, "kl": 0.067626953125, "learning_rate": 9.659359190556491e-07, "loss": 0.0001, "reward": 3.545642137527466, "reward_std": 0.36958497762680054, "rewards/final_reward": 1.3276030857900536, "rewards/mask_iou_reward": 0.6638015428950268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5456423163414001, "rewards/thk_ans_format_reward": 1.0, "step": 101, "think_completion_length": 68.53125 }, { "clip_ratio": 0.0, "completion_length": 149.421875, "epoch": 0.17200674536256325, "grad_norm": 3.9097650791535936, "kl": 0.06982421875, "learning_rate": 9.655986509274872e-07, "loss": 0.0001, "reward": 2.8055737018585205, "reward_std": 0.30134592577815056, "rewards/final_reward": 1.0802610803244432, "rewards/mask_iou_reward": 0.5401305401622216, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8055737316608429, "rewards/thk_ans_format_reward": 1.0, "step": 102, "think_completion_length": 77.40625 }, { "clip_ratio": 0.0, "completion_length": 142.375, "epoch": 0.1736930860033727, "grad_norm": 12.43241529951544, "kl": 0.0810546875, "learning_rate": 9.652613827993255e-07, "loss": 0.0001, "reward": 2.6046589612960815, "reward_std": 0.36044664680957794, "rewards/final_reward": 0.9996509333597047, "rewards/mask_iou_reward": 0.49982546667985234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6046590209007263, "rewards/thk_ans_format_reward": 1.0, "step": 103, "think_completion_length": 75.6875 }, { "clip_ratio": 0.0, "completion_length": 158.28125, "epoch": 0.17537942664418213, "grad_norm": 4.524572883875218, "kl": 0.099365234375, "learning_rate": 9.649241146711636e-07, "loss": 0.0001, "reward": 2.2724088430404663, "reward_std": 0.18580714613199234, "rewards/final_reward": 0.41834230934148653, "rewards/mask_iou_reward": 0.20917115467074326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.27240876853466034, "rewards/thk_ans_format_reward": 1.0, "step": 104, "think_completion_length": 79.34375 }, { "clip_ratio": 0.0, "completion_length": 155.109375, "epoch": 0.17706576728499157, "grad_norm": 14.131641702675266, "kl": 0.06591796875, "learning_rate": 9.645868465430017e-07, "loss": 0.0001, "reward": 2.3781230449676514, "reward_std": 0.26029431354254484, "rewards/final_reward": 0.4826050544302346, "rewards/mask_iou_reward": 0.2413025272151173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.37812306452542543, "rewards/thk_ans_format_reward": 1.0, "step": 105, "think_completion_length": 97.6875 }, { "clip_ratio": 0.0, "completion_length": 159.21875, "epoch": 0.178752107925801, "grad_norm": 6.944889536872177, "kl": 0.0791015625, "learning_rate": 9.642495784148398e-07, "loss": 0.0001, "reward": 3.2580912113189697, "reward_std": 0.3155831843614578, "rewards/final_reward": 0.9140460384999494, "rewards/mask_iou_reward": 0.4570230192499747, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2580912709236145, "rewards/thk_ans_format_reward": 1.0, "step": 106, "think_completion_length": 103.0625 }, { "clip_ratio": 0.0, "completion_length": 158.5, "epoch": 0.18043844856661045, "grad_norm": 5.649283035789506, "kl": 0.0592041015625, "learning_rate": 9.639123102866778e-07, "loss": 0.0001, "reward": 2.871160387992859, "reward_std": 0.28134259581565857, "rewards/final_reward": 1.0845442635996905, "rewards/mask_iou_reward": 0.5422721317998452, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8711603879928589, "rewards/thk_ans_format_reward": 1.0, "step": 107, "think_completion_length": 86.5 }, { "clip_ratio": 0.0, "completion_length": 158.4375, "epoch": 0.1821247892074199, "grad_norm": 4.979931040893193, "kl": 0.07080078125, "learning_rate": 9.63575042158516e-07, "loss": 0.0001, "reward": 2.6347672939300537, "reward_std": 0.3419078588485718, "rewards/final_reward": 0.7823988325154321, "rewards/mask_iou_reward": 0.39119941625771604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6347672492265701, "rewards/thk_ans_format_reward": 1.0, "step": 108, "think_completion_length": 81.90625 }, { "clip_ratio": 0.0, "completion_length": 157.765625, "epoch": 0.18381112984822934, "grad_norm": 16.6980716433444, "kl": 0.0577392578125, "learning_rate": 9.63237774030354e-07, "loss": 0.0001, "reward": 2.8260412216186523, "reward_std": 0.2677394151687622, "rewards/final_reward": 0.9963177445880472, "rewards/mask_iou_reward": 0.4981588722940236, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8416662067174911, "rewards/thk_ans_format_reward": 0.984375, "step": 109, "think_completion_length": 88.34375 }, { "clip_ratio": 0.0, "completion_length": 184.828125, "epoch": 0.18549747048903878, "grad_norm": 2.9645999481574634, "kl": 0.05419921875, "learning_rate": 9.629005059021921e-07, "loss": 0.0001, "reward": 2.6315606832504272, "reward_std": 0.3063789587467909, "rewards/final_reward": 0.47596354687676634, "rewards/mask_iou_reward": 0.23798177343838317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6315606329590082, "rewards/thk_ans_format_reward": 1.0, "step": 110, "think_completion_length": 119.0625 }, { "clip_ratio": 0.0, "completion_length": 170.359375, "epoch": 0.18718381112984822, "grad_norm": 5.187350710181049, "kl": 0.058349609375, "learning_rate": 9.625632377740302e-07, "loss": 0.0001, "reward": 3.09405779838562, "reward_std": 0.15743490681052208, "rewards/final_reward": 1.468626562133811, "rewards/mask_iou_reward": 0.7343132810669055, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0940579175949097, "rewards/thk_ans_format_reward": 1.0, "step": 111, "think_completion_length": 94.0625 }, { "clip_ratio": 0.0, "completion_length": 140.0, "epoch": 0.18887015177065766, "grad_norm": 4.049281276892838, "kl": 0.11572265625, "learning_rate": 9.622259696458685e-07, "loss": 0.0001, "reward": 3.07741117477417, "reward_std": 0.5245492458343506, "rewards/final_reward": 1.250567170226694, "rewards/mask_iou_reward": 0.625283585113347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0774111449718475, "rewards/thk_ans_format_reward": 1.0, "step": 112, "think_completion_length": 72.59375 }, { "clip_ratio": 0.0, "completion_length": 178.9375, "epoch": 0.1905564924114671, "grad_norm": 8.029055466527632, "kl": 0.054443359375, "learning_rate": 9.618887015177066e-07, "loss": 0.0001, "reward": 2.5914098024368286, "reward_std": 0.4398697763681412, "rewards/final_reward": 0.7285501404788846, "rewards/mask_iou_reward": 0.3642750702394423, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.6070348471403122, "rewards/thk_ans_format_reward": 1.0, "step": 113, "think_completion_length": 78.5 }, { "clip_ratio": 0.0, "completion_length": 156.140625, "epoch": 0.19224283305227655, "grad_norm": 3.086404264380978, "kl": 0.05078125, "learning_rate": 9.615514333895447e-07, "loss": 0.0001, "reward": 2.4672300815582275, "reward_std": 0.29644207656383514, "rewards/final_reward": 0.706325008253994, "rewards/mask_iou_reward": 0.353162504126997, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4672301113605499, "rewards/thk_ans_format_reward": 1.0, "step": 114, "think_completion_length": 80.53125 }, { "clip_ratio": 0.0, "completion_length": 142.65625, "epoch": 0.19392917369308602, "grad_norm": 4.464912668237362, "kl": 0.050537109375, "learning_rate": 9.612141652613828e-07, "loss": 0.0001, "reward": 2.9204636812210083, "reward_std": 0.1726682484149933, "rewards/final_reward": 1.011700142074487, "rewards/mask_iou_reward": 0.5058500710372436, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9204636216163635, "rewards/thk_ans_format_reward": 1.0, "step": 115, "think_completion_length": 61.75 }, { "clip_ratio": 0.0, "completion_length": 161.609375, "epoch": 0.19561551433389546, "grad_norm": 4.630079087067869, "kl": 0.0567626953125, "learning_rate": 9.608768971332208e-07, "loss": 0.0001, "reward": 2.5220160484313965, "reward_std": 0.2584230601787567, "rewards/final_reward": 0.5448442829958422, "rewards/mask_iou_reward": 0.2724221414979211, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5220160484313965, "rewards/thk_ans_format_reward": 1.0, "step": 116, "think_completion_length": 89.90625 }, { "clip_ratio": 0.0, "completion_length": 155.890625, "epoch": 0.1973018549747049, "grad_norm": 4.621872533032156, "kl": 0.056884765625, "learning_rate": 9.60539629005059e-07, "loss": 0.0001, "reward": 2.9947354793548584, "reward_std": 0.20887230336666107, "rewards/final_reward": 0.9475295686720593, "rewards/mask_iou_reward": 0.47376478433602964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9947354048490524, "rewards/thk_ans_format_reward": 1.0, "step": 117, "think_completion_length": 80.15625 }, { "clip_ratio": 0.0, "completion_length": 158.421875, "epoch": 0.19898819561551434, "grad_norm": 9.497515578211454, "kl": 0.063720703125, "learning_rate": 9.60202360876897e-07, "loss": 0.0001, "reward": 2.611419439315796, "reward_std": 0.36831772327423096, "rewards/final_reward": 0.7176738346947997, "rewards/mask_iou_reward": 0.35883691734739986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6114194691181183, "rewards/thk_ans_format_reward": 1.0, "step": 118, "think_completion_length": 98.0 }, { "clip_ratio": 0.0, "completion_length": 150.453125, "epoch": 0.20067453625632378, "grad_norm": 55.096724445429054, "kl": 0.05517578125, "learning_rate": 9.598650927487351e-07, "loss": 0.0001, "reward": 3.0991926193237305, "reward_std": 0.3100406602025032, "rewards/final_reward": 0.8859623609390234, "rewards/mask_iou_reward": 0.4429811804695117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0991926789283752, "rewards/thk_ans_format_reward": 1.0, "step": 119, "think_completion_length": 90.0625 }, { "clip_ratio": 0.0, "completion_length": 158.78125, "epoch": 0.20236087689713322, "grad_norm": 8.39851702173129, "kl": 0.04931640625, "learning_rate": 9.595278246205734e-07, "loss": 0.0, "reward": 2.797035336494446, "reward_std": 0.1637876257300377, "rewards/final_reward": 0.8639884837569479, "rewards/mask_iou_reward": 0.43199424187847396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7970353364944458, "rewards/thk_ans_format_reward": 1.0, "step": 120, "think_completion_length": 89.5625 }, { "clip_ratio": 0.0, "completion_length": 159.421875, "epoch": 0.20404721753794267, "grad_norm": 2.534477601211979, "kl": 0.043701171875, "learning_rate": 9.591905564924115e-07, "loss": -0.0001, "reward": 2.9477614164352417, "reward_std": 0.2593442127108574, "rewards/final_reward": 1.5180886726556748, "rewards/mask_iou_reward": 0.7590443363278374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9477613866329193, "rewards/thk_ans_format_reward": 1.0, "step": 121, "think_completion_length": 81.65625 }, { "clip_ratio": 0.0, "completion_length": 172.609375, "epoch": 0.2057335581787521, "grad_norm": 4.377934435913437, "kl": 0.0535888671875, "learning_rate": 9.588532883642496e-07, "loss": 0.0001, "reward": 2.576362371444702, "reward_std": 0.25838133692741394, "rewards/final_reward": 0.2687840057189522, "rewards/mask_iou_reward": 0.1343920028594761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5763622224330902, "rewards/thk_ans_format_reward": 1.0, "step": 122, "think_completion_length": 105.25 }, { "clip_ratio": 0.0, "completion_length": 167.9375, "epoch": 0.20741989881956155, "grad_norm": 6.588359922719636, "kl": 0.0567626953125, "learning_rate": 9.585160202360877e-07, "loss": 0.0001, "reward": 2.8375957012176514, "reward_std": 0.4054133892059326, "rewards/final_reward": 0.9836463211897768, "rewards/mask_iou_reward": 0.4918231605948884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8375958502292633, "rewards/thk_ans_format_reward": 1.0, "step": 123, "think_completion_length": 96.4375 }, { "clip_ratio": 0.0, "completion_length": 162.671875, "epoch": 0.209106239460371, "grad_norm": 10.814198077246532, "kl": 0.0535888671875, "learning_rate": 9.581787521079258e-07, "loss": 0.0001, "reward": 2.4431896209716797, "reward_std": 0.2660168632864952, "rewards/final_reward": 0.5794934557994321, "rewards/mask_iou_reward": 0.28974672789971606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.44318948313593864, "rewards/thk_ans_format_reward": 1.0, "step": 124, "think_completion_length": 79.25 }, { "clip_ratio": 0.0, "completion_length": 159.328125, "epoch": 0.21079258010118043, "grad_norm": 4.274399889954174, "kl": 0.0465087890625, "learning_rate": 9.578414839797638e-07, "loss": 0.0, "reward": 3.0381346940994263, "reward_std": 0.22179779410362244, "rewards/final_reward": 1.4394426651939196, "rewards/mask_iou_reward": 0.7197213325969598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0381346344947815, "rewards/thk_ans_format_reward": 1.0, "step": 125, "think_completion_length": 102.15625 }, { "clip_ratio": 0.0, "completion_length": 162.6875, "epoch": 0.21247892074198987, "grad_norm": 8.152408435882341, "kl": 0.0556640625, "learning_rate": 9.57504215851602e-07, "loss": 0.0001, "reward": 2.923964023590088, "reward_std": 0.2563727870583534, "rewards/final_reward": 1.089894641666783, "rewards/mask_iou_reward": 0.5449473208333915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9239640831947327, "rewards/thk_ans_format_reward": 1.0, "step": 126, "think_completion_length": 85.6875 }, { "clip_ratio": 0.0, "completion_length": 154.265625, "epoch": 0.21416526138279932, "grad_norm": 4.449214680152263, "kl": 0.0687255859375, "learning_rate": 9.5716694772344e-07, "loss": 0.0001, "reward": 2.740877628326416, "reward_std": 0.46522799134254456, "rewards/final_reward": 1.1671571540552383, "rewards/mask_iou_reward": 0.5835785770276192, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7565025985240936, "rewards/thk_ans_format_reward": 0.984375, "step": 127, "think_completion_length": 91.5 }, { "clip_ratio": 0.0, "completion_length": 153.15625, "epoch": 0.21585160202360876, "grad_norm": 4.17184352546643, "kl": 0.060546875, "learning_rate": 9.56829679595278e-07, "loss": 0.0001, "reward": 2.8540847301483154, "reward_std": 0.4098881930112839, "rewards/final_reward": 0.3388112149192265, "rewards/mask_iou_reward": 0.16940560745961325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.854084700345993, "rewards/thk_ans_format_reward": 1.0, "step": 128, "think_completion_length": 88.78125 }, { "clip_ratio": 0.0, "completion_length": 157.34375, "epoch": 0.2175379426644182, "grad_norm": 15.065555837938753, "kl": 0.072265625, "learning_rate": 9.564924114671164e-07, "loss": 0.0001, "reward": 3.1309502124786377, "reward_std": 0.4588165432214737, "rewards/final_reward": 1.147073274599184, "rewards/mask_iou_reward": 0.573536637299592, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1309503316879272, "rewards/thk_ans_format_reward": 1.0, "step": 129, "think_completion_length": 92.3125 }, { "clip_ratio": 0.0, "completion_length": 161.9375, "epoch": 0.21922428330522767, "grad_norm": 22.465927435670828, "kl": 0.08154296875, "learning_rate": 9.561551433389545e-07, "loss": 0.0001, "reward": 2.7864497900009155, "reward_std": 0.31874626129865646, "rewards/final_reward": 0.4232256065266388, "rewards/mask_iou_reward": 0.2116128032633194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7864498198032379, "rewards/thk_ans_format_reward": 1.0, "step": 130, "think_completion_length": 92.53125 }, { "clip_ratio": 0.0, "completion_length": 140.3125, "epoch": 0.2209106239460371, "grad_norm": 6.419347933512376, "kl": 0.077880859375, "learning_rate": 9.558178752107926e-07, "loss": 0.0001, "reward": 2.908892869949341, "reward_std": 0.44153931736946106, "rewards/final_reward": 0.5672732777281655, "rewards/mask_iou_reward": 0.28363663886408275, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9088928699493408, "rewards/thk_ans_format_reward": 1.0, "step": 131, "think_completion_length": 71.71875 }, { "clip_ratio": 0.0, "completion_length": 164.59375, "epoch": 0.22259696458684655, "grad_norm": 38.99623994278225, "kl": 0.086181640625, "learning_rate": 9.554806070826307e-07, "loss": 0.0001, "reward": 3.0554357767105103, "reward_std": 0.3175787627696991, "rewards/final_reward": 0.8004080636881445, "rewards/mask_iou_reward": 0.40020403184407227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.055435687303543, "rewards/thk_ans_format_reward": 1.0, "step": 132, "think_completion_length": 78.84375 }, { "clip_ratio": 0.0, "completion_length": 145.65625, "epoch": 0.224283305227656, "grad_norm": 3.0018346933655375, "kl": 0.087646484375, "learning_rate": 9.551433389544688e-07, "loss": 0.0001, "reward": 2.4450541734695435, "reward_std": 0.34064342081546783, "rewards/final_reward": 0.4599661168873327, "rewards/mask_iou_reward": 0.22998305844366634, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.46067917346954346, "rewards/thk_ans_format_reward": 0.984375, "step": 133, "think_completion_length": 79.65625 }, { "clip_ratio": 0.0, "completion_length": 161.625, "epoch": 0.22596964586846544, "grad_norm": 9.578539076710266, "kl": 0.0859375, "learning_rate": 9.548060708263068e-07, "loss": 0.0001, "reward": 2.931985855102539, "reward_std": 0.48415741324424744, "rewards/final_reward": 0.7854134764268852, "rewards/mask_iou_reward": 0.3927067382134426, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9319857358932495, "rewards/thk_ans_format_reward": 1.0, "step": 134, "think_completion_length": 92.0625 }, { "clip_ratio": 0.0, "completion_length": 155.484375, "epoch": 0.22765598650927488, "grad_norm": 3.9455952839664166, "kl": 0.0791015625, "learning_rate": 9.54468802698145e-07, "loss": 0.0001, "reward": 2.699129104614258, "reward_std": 0.46679961681365967, "rewards/final_reward": 0.6493981932567956, "rewards/mask_iou_reward": 0.3246990966283978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6991291344165802, "rewards/thk_ans_format_reward": 1.0, "step": 135, "think_completion_length": 77.9375 }, { "clip_ratio": 0.0, "completion_length": 178.671875, "epoch": 0.22934232715008432, "grad_norm": 26.084465650772835, "kl": 0.6328125, "learning_rate": 9.54131534569983e-07, "loss": 0.0006, "reward": 2.6946918964385986, "reward_std": 0.3354320228099823, "rewards/final_reward": 0.917651972879823, "rewards/mask_iou_reward": 0.4588259864399115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7103168219327927, "rewards/thk_ans_format_reward": 0.984375, "step": 136, "think_completion_length": 79.71875 }, { "clip_ratio": 0.0, "completion_length": 151.171875, "epoch": 0.23102866779089376, "grad_norm": 5.895893689415301, "kl": 0.078125, "learning_rate": 9.53794266441821e-07, "loss": 0.0001, "reward": 2.541081666946411, "reward_std": 0.22961698472499847, "rewards/final_reward": 0.9253931676236163, "rewards/mask_iou_reward": 0.46269658381180817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5410817265510559, "rewards/thk_ans_format_reward": 1.0, "step": 137, "think_completion_length": 87.0 }, { "clip_ratio": 0.0, "completion_length": 157.046875, "epoch": 0.2327150084317032, "grad_norm": 4.965800154740723, "kl": 0.08056640625, "learning_rate": 9.534569983136593e-07, "loss": 0.0001, "reward": 2.9825655221939087, "reward_std": 0.286454439163208, "rewards/final_reward": 1.7143570051156443, "rewards/mask_iou_reward": 0.8571785025578221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9981906414031982, "rewards/thk_ans_format_reward": 0.984375, "step": 138, "think_completion_length": 92.25 }, { "clip_ratio": 0.0, "completion_length": 149.640625, "epoch": 0.23440134907251264, "grad_norm": 3.891606818741714, "kl": 0.1005859375, "learning_rate": 9.531197301854974e-07, "loss": 0.0001, "reward": 2.3226908445358276, "reward_std": 0.2071321550756693, "rewards/final_reward": 0.4069366052566161, "rewards/mask_iou_reward": 0.20346830262830806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.32269074441865087, "rewards/thk_ans_format_reward": 1.0, "step": 139, "think_completion_length": 84.375 }, { "clip_ratio": 0.0, "completion_length": 142.65625, "epoch": 0.23608768971332209, "grad_norm": 9.336418443843108, "kl": 0.087890625, "learning_rate": 9.527824620573356e-07, "loss": 0.0001, "reward": 3.410194993019104, "reward_std": 0.25575824826955795, "rewards/final_reward": 1.5624018070481909, "rewards/mask_iou_reward": 0.7812009035240954, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4101950526237488, "rewards/thk_ans_format_reward": 1.0, "step": 140, "think_completion_length": 77.5625 }, { "clip_ratio": 0.0, "completion_length": 156.90625, "epoch": 0.23777403035413153, "grad_norm": 6.200561227361159, "kl": 0.09375, "learning_rate": 9.524451939291737e-07, "loss": 0.0001, "reward": 2.8541425466537476, "reward_std": 0.6211456060409546, "rewards/final_reward": 1.2935642400096632, "rewards/mask_iou_reward": 0.6467821200048316, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.8853925466537476, "rewards/thk_ans_format_reward": 1.0, "step": 141, "think_completion_length": 78.5625 }, { "clip_ratio": 0.0, "completion_length": 142.25, "epoch": 0.23946037099494097, "grad_norm": 10.65132715665166, "kl": 0.093017578125, "learning_rate": 9.521079258010118e-07, "loss": 0.0001, "reward": 2.95055615901947, "reward_std": 0.5230741798877716, "rewards/final_reward": 0.6756522948538228, "rewards/mask_iou_reward": 0.3378261474269114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9505561292171478, "rewards/thk_ans_format_reward": 1.0, "step": 142, "think_completion_length": 65.75 }, { "clip_ratio": 0.0, "completion_length": 145.953125, "epoch": 0.2411467116357504, "grad_norm": 5.2077714220137405, "kl": 0.097900390625, "learning_rate": 9.517706576728499e-07, "loss": 0.0001, "reward": 3.337947726249695, "reward_std": 0.2577322721481323, "rewards/final_reward": 0.8645473082831757, "rewards/mask_iou_reward": 0.43227365414158786, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.33794766664505, "rewards/thk_ans_format_reward": 1.0, "step": 143, "think_completion_length": 77.40625 }, { "clip_ratio": 0.0, "completion_length": 129.03125, "epoch": 0.24283305227655985, "grad_norm": 8.046601536672426, "kl": 0.1103515625, "learning_rate": 9.51433389544688e-07, "loss": 0.0001, "reward": 2.8948484659194946, "reward_std": 0.25463247299194336, "rewards/final_reward": 0.524457777173691, "rewards/mask_iou_reward": 0.2622288885868455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9104736149311066, "rewards/thk_ans_format_reward": 0.984375, "step": 144, "think_completion_length": 62.1875 }, { "clip_ratio": 0.0, "completion_length": 144.21875, "epoch": 0.24451939291736932, "grad_norm": 5.0074848777872445, "kl": 0.103759765625, "learning_rate": 9.51096121416526e-07, "loss": 0.0001, "reward": 2.683648943901062, "reward_std": 0.3750077337026596, "rewards/final_reward": 0.8593287375390414, "rewards/mask_iou_reward": 0.4296643687695207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6836489215493202, "rewards/thk_ans_format_reward": 1.0, "step": 145, "think_completion_length": 73.0 }, { "clip_ratio": 0.0, "completion_length": 143.6875, "epoch": 0.24620573355817876, "grad_norm": 4.963438244921841, "kl": 0.1044921875, "learning_rate": 9.507588532883642e-07, "loss": 0.0001, "reward": 2.661731004714966, "reward_std": 0.3136795163154602, "rewards/final_reward": 0.4139575300345308, "rewards/mask_iou_reward": 0.2069787650172654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6617311537265778, "rewards/thk_ans_format_reward": 1.0, "step": 146, "think_completion_length": 61.25 }, { "clip_ratio": 0.0, "completion_length": 153.921875, "epoch": 0.2478920741989882, "grad_norm": 9.38267745831431, "kl": 0.11376953125, "learning_rate": 9.504215851602023e-07, "loss": 0.0001, "reward": 3.2500319480895996, "reward_std": 0.46346913278102875, "rewards/final_reward": 1.0314903289685486, "rewards/mask_iou_reward": 0.5157451644842743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2500320076942444, "rewards/thk_ans_format_reward": 1.0, "step": 147, "think_completion_length": 72.5625 }, { "clip_ratio": 0.0, "completion_length": 139.125, "epoch": 0.24957841483979765, "grad_norm": 12.293398423965753, "kl": 0.099853515625, "learning_rate": 9.500843170320404e-07, "loss": 0.0001, "reward": 3.3502708673477173, "reward_std": 0.4048406034708023, "rewards/final_reward": 1.096210174595348, "rewards/mask_iou_reward": 0.548105087297674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3502709865570068, "rewards/thk_ans_format_reward": 1.0, "step": 148, "think_completion_length": 67.34375 }, { "clip_ratio": 0.0, "completion_length": 138.65625, "epoch": 0.25126475548060706, "grad_norm": 5.36062250866705, "kl": 0.1181640625, "learning_rate": 9.497470489038786e-07, "loss": 0.0001, "reward": 2.4620732069015503, "reward_std": 0.2523811161518097, "rewards/final_reward": 0.3980805396072795, "rewards/mask_iou_reward": 0.19904026980363976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4620732367038727, "rewards/thk_ans_format_reward": 1.0, "step": 149, "think_completion_length": 69.0625 }, { "clip_ratio": 0.0, "completion_length": 137.734375, "epoch": 0.25295109612141653, "grad_norm": 4.872750940333902, "kl": 0.10546875, "learning_rate": 9.494097807757167e-07, "loss": 0.0001, "reward": 3.1407504081726074, "reward_std": 0.1939391940832138, "rewards/final_reward": 1.8599050823740502, "rewards/mask_iou_reward": 0.9299525411870251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1407504081726074, "rewards/thk_ans_format_reward": 1.0, "step": 150, "think_completion_length": 65.53125 }, { "clip_ratio": 0.0, "completion_length": 147.765625, "epoch": 0.25463743676222594, "grad_norm": 3.2884978489457337, "kl": 0.099609375, "learning_rate": 9.490725126475548e-07, "loss": 0.0001, "reward": 2.5347702503204346, "reward_std": 0.2712481617927551, "rewards/final_reward": 0.6935733601988581, "rewards/mask_iou_reward": 0.34678668009942903, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5347701907157898, "rewards/thk_ans_format_reward": 1.0, "step": 151, "think_completion_length": 73.0625 }, { "clip_ratio": 0.0, "completion_length": 139.734375, "epoch": 0.2563237774030354, "grad_norm": 5.393263085729027, "kl": 0.10888671875, "learning_rate": 9.487352445193929e-07, "loss": 0.0001, "reward": 2.9217779636383057, "reward_std": 0.3797933831810951, "rewards/final_reward": 1.345318416221491, "rewards/mask_iou_reward": 0.6726592081107455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9217780828475952, "rewards/thk_ans_format_reward": 1.0, "step": 152, "think_completion_length": 76.34375 }, { "clip_ratio": 0.0, "completion_length": 142.328125, "epoch": 0.2580101180438449, "grad_norm": 5.26665151298103, "kl": 0.1240234375, "learning_rate": 9.48397976391231e-07, "loss": 0.0001, "reward": 3.006482243537903, "reward_std": 0.37782153487205505, "rewards/final_reward": 1.1417938718538108, "rewards/mask_iou_reward": 0.5708969359269054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0064822435379028, "rewards/thk_ans_format_reward": 1.0, "step": 153, "think_completion_length": 69.125 }, { "clip_ratio": 0.0, "completion_length": 132.640625, "epoch": 0.2596964586846543, "grad_norm": 6.030885434029907, "kl": 0.111572265625, "learning_rate": 9.48060708263069e-07, "loss": 0.0001, "reward": 2.444668173789978, "reward_std": 0.2628085985779762, "rewards/final_reward": 0.01806783909522232, "rewards/mask_iou_reward": 0.00903391954761116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4446682594716549, "rewards/thk_ans_format_reward": 1.0, "step": 154, "think_completion_length": 65.6875 }, { "clip_ratio": 0.0, "completion_length": 144.703125, "epoch": 0.26138279932546377, "grad_norm": 13.520169048491766, "kl": 0.126953125, "learning_rate": 9.477234401349072e-07, "loss": 0.0001, "reward": 2.868806838989258, "reward_std": 0.4705194979906082, "rewards/final_reward": 0.8561282192138441, "rewards/mask_iou_reward": 0.42806410960692204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8688068389892578, "rewards/thk_ans_format_reward": 1.0, "step": 155, "think_completion_length": 55.28125 }, { "clip_ratio": 0.0, "completion_length": 132.90625, "epoch": 0.2630691399662732, "grad_norm": 13.417822343339733, "kl": 0.122314453125, "learning_rate": 9.473861720067453e-07, "loss": 0.0001, "reward": 3.0076318979263306, "reward_std": 0.303857646882534, "rewards/final_reward": 0.8894650284318986, "rewards/mask_iou_reward": 0.4447325142159493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0076318979263306, "rewards/thk_ans_format_reward": 1.0, "step": 156, "think_completion_length": 72.625 }, { "clip_ratio": 0.0, "completion_length": 136.6875, "epoch": 0.26475548060708265, "grad_norm": 8.72973658808703, "kl": 0.14404296875, "learning_rate": 9.470489038785834e-07, "loss": 0.0001, "reward": 2.947430729866028, "reward_std": 0.15501541644334793, "rewards/final_reward": 0.2652462593609649, "rewards/mask_iou_reward": 0.13262312968048245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9474307894706726, "rewards/thk_ans_format_reward": 1.0, "step": 157, "think_completion_length": 72.40625 }, { "clip_ratio": 0.0, "completion_length": 138.046875, "epoch": 0.26644182124789206, "grad_norm": 6.762615497616932, "kl": 0.11279296875, "learning_rate": 9.467116357504216e-07, "loss": 0.0001, "reward": 3.3199820518493652, "reward_std": 0.25218017399311066, "rewards/final_reward": 1.0872463613153267, "rewards/mask_iou_reward": 0.5436231806576634, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.31998211145401, "rewards/thk_ans_format_reward": 1.0, "step": 158, "think_completion_length": 67.40625 }, { "clip_ratio": 0.0, "completion_length": 151.828125, "epoch": 0.26812816188870153, "grad_norm": 5.092870880513967, "kl": 0.10546875, "learning_rate": 9.463743676222597e-07, "loss": 0.0001, "reward": 2.7609550952911377, "reward_std": 0.3843376636505127, "rewards/final_reward": 0.8584499362887056, "rewards/mask_iou_reward": 0.4292249681443528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7609550952911377, "rewards/thk_ans_format_reward": 1.0, "step": 159, "think_completion_length": 78.4375 }, { "clip_ratio": 0.0, "completion_length": 140.0625, "epoch": 0.26981450252951095, "grad_norm": 5.147772569205811, "kl": 0.1318359375, "learning_rate": 9.460370994940977e-07, "loss": 0.0001, "reward": 3.031912684440613, "reward_std": 0.4968326687812805, "rewards/final_reward": 0.9247348266948624, "rewards/mask_iou_reward": 0.4623674133474312, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0319127440452576, "rewards/thk_ans_format_reward": 1.0, "step": 160, "think_completion_length": 64.21875 }, { "clip_ratio": 0.0, "completion_length": 144.40625, "epoch": 0.2715008431703204, "grad_norm": 4.8651031750590406, "kl": 0.118408203125, "learning_rate": 9.456998313659359e-07, "loss": 0.0001, "reward": 2.696570873260498, "reward_std": 0.3566492199897766, "rewards/final_reward": 0.5540198197293646, "rewards/mask_iou_reward": 0.2770099098646823, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7121958136558533, "rewards/thk_ans_format_reward": 1.0, "step": 161, "think_completion_length": 73.53125 }, { "clip_ratio": 0.0, "completion_length": 135.1875, "epoch": 0.27318718381112983, "grad_norm": 10.86627896290096, "kl": 0.12841796875, "learning_rate": 9.453625632377739e-07, "loss": 0.0001, "reward": 2.649171829223633, "reward_std": 0.3913609981536865, "rewards/final_reward": 0.5830436964675607, "rewards/mask_iou_reward": 0.29152184823378036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6491719186306, "rewards/thk_ans_format_reward": 1.0, "step": 162, "think_completion_length": 65.90625 }, { "clip_ratio": 0.0, "completion_length": 142.984375, "epoch": 0.2748735244519393, "grad_norm": 5.681694741511601, "kl": 0.110107421875, "learning_rate": 9.450252951096121e-07, "loss": 0.0001, "reward": 2.69570529460907, "reward_std": 0.3432028442621231, "rewards/final_reward": 0.9890574993933601, "rewards/mask_iou_reward": 0.49452874969668004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.695705458521843, "rewards/thk_ans_format_reward": 1.0, "step": 163, "think_completion_length": 63.125 }, { "clip_ratio": 0.0, "completion_length": 149.46875, "epoch": 0.2765598650927487, "grad_norm": 21.25176205198972, "kl": 0.1181640625, "learning_rate": 9.446880269814502e-07, "loss": 0.0001, "reward": 2.522574782371521, "reward_std": 0.27491385489702225, "rewards/final_reward": 0.2878840173621903, "rewards/mask_iou_reward": 0.14394200868109516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.522574707865715, "rewards/thk_ans_format_reward": 1.0, "step": 164, "think_completion_length": 58.875 }, { "clip_ratio": 0.0, "completion_length": 135.265625, "epoch": 0.2782462057335582, "grad_norm": 5.401276166916328, "kl": 0.13037109375, "learning_rate": 9.443507588532883e-07, "loss": 0.0001, "reward": 2.662353754043579, "reward_std": 0.2629779279232025, "rewards/final_reward": 1.0439006900901509, "rewards/mask_iou_reward": 0.5219503450450754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6779787838459015, "rewards/thk_ans_format_reward": 0.984375, "step": 165, "think_completion_length": 60.375 }, { "clip_ratio": 0.0, "completion_length": 138.21875, "epoch": 0.2799325463743676, "grad_norm": 10.639716221887245, "kl": 0.1279296875, "learning_rate": 9.440134907251265e-07, "loss": 0.0001, "reward": 2.6253796815872192, "reward_std": 0.2937234491109848, "rewards/final_reward": 0.8173267802429207, "rewards/mask_iou_reward": 0.40866339012146036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6253797858953476, "rewards/thk_ans_format_reward": 1.0, "step": 166, "think_completion_length": 61.53125 }, { "clip_ratio": 0.0, "completion_length": 139.46875, "epoch": 0.28161888701517707, "grad_norm": 4.376668469291415, "kl": 0.12744140625, "learning_rate": 9.436762225969646e-07, "loss": 0.0001, "reward": 2.68786883354187, "reward_std": 0.1904464066028595, "rewards/final_reward": 0.1009874731727318, "rewards/mask_iou_reward": 0.0504937365863659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.687868744134903, "rewards/thk_ans_format_reward": 1.0, "step": 167, "think_completion_length": 70.59375 }, { "clip_ratio": 0.0, "completion_length": 134.796875, "epoch": 0.28330522765598654, "grad_norm": 4.162857484309753, "kl": 0.1162109375, "learning_rate": 9.433389544688027e-07, "loss": 0.0001, "reward": 2.4673460721969604, "reward_std": 0.14471174776554108, "rewards/final_reward": 0.6801467773036485, "rewards/mask_iou_reward": 0.34007338865182424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4673460125923157, "rewards/thk_ans_format_reward": 1.0, "step": 168, "think_completion_length": 70.3125 }, { "clip_ratio": 0.0, "completion_length": 139.5625, "epoch": 0.28499156829679595, "grad_norm": 5.8655442391890285, "kl": 0.2138671875, "learning_rate": 9.430016863406409e-07, "loss": 0.0002, "reward": 2.7184778451919556, "reward_std": 0.3520616292953491, "rewards/final_reward": 1.123103388258644, "rewards/mask_iou_reward": 0.561551694129322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7341028153896332, "rewards/thk_ans_format_reward": 0.984375, "step": 169, "think_completion_length": 62.96875 }, { "clip_ratio": 0.0, "completion_length": 131.796875, "epoch": 0.2866779089376054, "grad_norm": 11.32268035590721, "kl": 0.12255859375, "learning_rate": 9.426644182124788e-07, "loss": 0.0001, "reward": 2.7999212741851807, "reward_std": 0.3271795064210892, "rewards/final_reward": 0.5870644804219973, "rewards/mask_iou_reward": 0.29353224021099866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7999212741851807, "rewards/thk_ans_format_reward": 1.0, "step": 170, "think_completion_length": 60.4375 }, { "clip_ratio": 0.0, "completion_length": 131.84375, "epoch": 0.28836424957841483, "grad_norm": 8.834700309247399, "kl": 0.11962890625, "learning_rate": 9.423271500843169e-07, "loss": 0.0001, "reward": 2.8714534044265747, "reward_std": 0.44849054515361786, "rewards/final_reward": 1.0860856449425818, "rewards/mask_iou_reward": 0.5430428224712909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8714532852172852, "rewards/thk_ans_format_reward": 1.0, "step": 171, "think_completion_length": 70.5625 }, { "clip_ratio": 0.0, "completion_length": 131.796875, "epoch": 0.2900505902192243, "grad_norm": 5.198743846520378, "kl": 0.1474609375, "learning_rate": 9.419898819561551e-07, "loss": 0.0001, "reward": 3.1690046787261963, "reward_std": 0.13895303010940552, "rewards/final_reward": 0.9271436176614136, "rewards/mask_iou_reward": 0.4635718088307068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.169004738330841, "rewards/thk_ans_format_reward": 1.0, "step": 172, "think_completion_length": 61.5625 }, { "clip_ratio": 0.0, "completion_length": 139.859375, "epoch": 0.2917369308600337, "grad_norm": 7.211165474999571, "kl": 0.13525390625, "learning_rate": 9.416526138279932e-07, "loss": 0.0001, "reward": 3.202454924583435, "reward_std": 0.4479510486125946, "rewards/final_reward": 1.5023353211161545, "rewards/mask_iou_reward": 0.7511676605580773, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2024548053741455, "rewards/thk_ans_format_reward": 1.0, "step": 173, "think_completion_length": 69.375 }, { "clip_ratio": 0.0, "completion_length": 131.953125, "epoch": 0.2934232715008432, "grad_norm": 4.313657233067247, "kl": 0.12841796875, "learning_rate": 9.413153456998313e-07, "loss": 0.0001, "reward": 2.9632985591888428, "reward_std": 0.31960703432559967, "rewards/final_reward": 0.4206897495451917, "rewards/mask_iou_reward": 0.21034487477259586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9632984697818756, "rewards/thk_ans_format_reward": 1.0, "step": 174, "think_completion_length": 64.90625 }, { "clip_ratio": 0.0, "completion_length": 141.203125, "epoch": 0.2951096121416526, "grad_norm": 7.5441699247531675, "kl": 0.128173828125, "learning_rate": 9.409780775716695e-07, "loss": 0.0001, "reward": 2.729974389076233, "reward_std": 0.20594902336597443, "rewards/final_reward": 1.2675996908269855, "rewards/mask_iou_reward": 0.6337998454134928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7299742102622986, "rewards/thk_ans_format_reward": 1.0, "step": 175, "think_completion_length": 59.46875 }, { "clip_ratio": 0.0, "completion_length": 133.890625, "epoch": 0.29679595278246207, "grad_norm": 5.8778504219298675, "kl": 0.14404296875, "learning_rate": 9.406408094435076e-07, "loss": 0.0001, "reward": 3.54919171333313, "reward_std": 0.20388521254062653, "rewards/final_reward": 1.228838939231336, "rewards/mask_iou_reward": 0.614419469615668, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.549191653728485, "rewards/thk_ans_format_reward": 1.0, "step": 176, "think_completion_length": 62.0 }, { "clip_ratio": 0.0, "completion_length": 127.34375, "epoch": 0.2984822934232715, "grad_norm": 3.521078841102136, "kl": 0.13330078125, "learning_rate": 9.403035413153457e-07, "loss": 0.0001, "reward": 3.1563611030578613, "reward_std": 0.3507531061768532, "rewards/final_reward": 1.1977695530795711, "rewards/mask_iou_reward": 0.5988847765397856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1563609838485718, "rewards/thk_ans_format_reward": 1.0, "step": 177, "think_completion_length": 59.03125 }, { "clip_ratio": 0.0, "completion_length": 125.890625, "epoch": 0.30016863406408095, "grad_norm": 5.740861912339471, "kl": 0.1357421875, "learning_rate": 9.399662731871839e-07, "loss": 0.0001, "reward": 2.9897106885910034, "reward_std": 0.4273761063814163, "rewards/final_reward": 1.2805484639491822, "rewards/mask_iou_reward": 0.6402742319745911, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0209608376026154, "rewards/thk_ans_format_reward": 0.984375, "step": 178, "think_completion_length": 56.0625 }, { "clip_ratio": 0.0, "completion_length": 129.59375, "epoch": 0.30185497470489037, "grad_norm": 9.879177378853775, "kl": 0.16455078125, "learning_rate": 9.396290050590218e-07, "loss": 0.0002, "reward": 2.9865111112594604, "reward_std": 0.350925549864769, "rewards/final_reward": 1.4638744302432896, "rewards/mask_iou_reward": 0.7319372151216448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.98651123046875, "rewards/thk_ans_format_reward": 1.0, "step": 179, "think_completion_length": 59.25 }, { "clip_ratio": 0.0, "completion_length": 127.21875, "epoch": 0.30354131534569984, "grad_norm": 8.287354511345475, "kl": 0.144775390625, "learning_rate": 9.392917369308599e-07, "loss": 0.0001, "reward": 2.5636744499206543, "reward_std": 0.4225110709667206, "rewards/final_reward": 0.8577971128846306, "rewards/mask_iou_reward": 0.4288985564423153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5636745691299438, "rewards/thk_ans_format_reward": 1.0, "step": 180, "think_completion_length": 54.75 }, { "clip_ratio": 0.0, "completion_length": 129.84375, "epoch": 0.30522765598650925, "grad_norm": 4.22167133798815, "kl": 0.15380859375, "learning_rate": 9.389544688026981e-07, "loss": 0.0002, "reward": 3.118129253387451, "reward_std": 0.22842250019311905, "rewards/final_reward": 0.9386232568832861, "rewards/mask_iou_reward": 0.46931162844164304, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.118129312992096, "rewards/thk_ans_format_reward": 1.0, "step": 181, "think_completion_length": 61.6875 }, { "clip_ratio": 0.0, "completion_length": 132.34375, "epoch": 0.3069139966273187, "grad_norm": 4.843783601225924, "kl": 0.14697265625, "learning_rate": 9.386172006745362e-07, "loss": 0.0001, "reward": 3.015873432159424, "reward_std": 0.4006500840187073, "rewards/final_reward": 1.1393304081924156, "rewards/mask_iou_reward": 0.5696652040962078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0158734321594238, "rewards/thk_ans_format_reward": 1.0, "step": 182, "think_completion_length": 58.5 }, { "clip_ratio": 0.0, "completion_length": 122.046875, "epoch": 0.3086003372681282, "grad_norm": 3.3709277392436303, "kl": 0.1494140625, "learning_rate": 9.382799325463743e-07, "loss": 0.0001, "reward": 2.794854760169983, "reward_std": 0.2856273353099823, "rewards/final_reward": 0.6165165264207468, "rewards/mask_iou_reward": 0.3082582632103734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7948549091815948, "rewards/thk_ans_format_reward": 1.0, "step": 183, "think_completion_length": 59.65625 }, { "clip_ratio": 0.0, "completion_length": 129.75, "epoch": 0.3102866779089376, "grad_norm": 6.530725976667846, "kl": 0.19873046875, "learning_rate": 9.379426644182125e-07, "loss": 0.0002, "reward": 3.2726598978042603, "reward_std": 0.2622811198234558, "rewards/final_reward": 0.9330878932926485, "rewards/mask_iou_reward": 0.46654394664632426, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2726598978042603, "rewards/thk_ans_format_reward": 1.0, "step": 184, "think_completion_length": 59.25 }, { "clip_ratio": 0.0, "completion_length": 124.6875, "epoch": 0.31197301854974707, "grad_norm": 5.970535259420742, "kl": 0.145263671875, "learning_rate": 9.376053962900506e-07, "loss": 0.0001, "reward": 2.518738865852356, "reward_std": 0.3007535934448242, "rewards/final_reward": 0.295968154432344, "rewards/mask_iou_reward": 0.147984077216172, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5187387764453888, "rewards/thk_ans_format_reward": 1.0, "step": 185, "think_completion_length": 57.875 }, { "clip_ratio": 0.0, "completion_length": 133.40625, "epoch": 0.3136593591905565, "grad_norm": 7.038454246978713, "kl": 0.1533203125, "learning_rate": 9.372681281618887e-07, "loss": 0.0002, "reward": 2.8668417930603027, "reward_std": 0.34225399792194366, "rewards/final_reward": 0.5465141226757314, "rewards/mask_iou_reward": 0.2732570613378657, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8824667930603027, "rewards/thk_ans_format_reward": 1.0, "step": 186, "think_completion_length": 59.5 }, { "clip_ratio": 0.0, "completion_length": 129.546875, "epoch": 0.31534569983136596, "grad_norm": 18.462270438443714, "kl": 0.14599609375, "learning_rate": 9.369308600337267e-07, "loss": 0.0001, "reward": 3.488566040992737, "reward_std": 0.20074902474880219, "rewards/final_reward": 1.8745138450901717, "rewards/mask_iou_reward": 0.9372569225450859, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4885660409927368, "rewards/thk_ans_format_reward": 1.0, "step": 187, "think_completion_length": 58.40625 }, { "clip_ratio": 0.0, "completion_length": 127.390625, "epoch": 0.31703204047217537, "grad_norm": 3.9471191401956762, "kl": 0.1474609375, "learning_rate": 9.365935919055648e-07, "loss": 0.0001, "reward": 3.4551355838775635, "reward_std": 0.29292161762714386, "rewards/final_reward": 1.4607345589720289, "rewards/mask_iou_reward": 0.7303672794860144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4551355838775635, "rewards/thk_ans_format_reward": 1.0, "step": 188, "think_completion_length": 62.28125 }, { "clip_ratio": 0.0, "completion_length": 129.796875, "epoch": 0.31871838111298484, "grad_norm": 4.814018672200347, "kl": 0.134765625, "learning_rate": 9.36256323777403e-07, "loss": 0.0001, "reward": 2.498222589492798, "reward_std": 0.34490717202425003, "rewards/final_reward": 0.5208702977899072, "rewards/mask_iou_reward": 0.2604351488949536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.49822261929512024, "rewards/thk_ans_format_reward": 1.0, "step": 189, "think_completion_length": 64.5 }, { "clip_ratio": 0.0, "completion_length": 128.421875, "epoch": 0.32040472175379425, "grad_norm": 4.210162438049502, "kl": 0.1533203125, "learning_rate": 9.359190556492411e-07, "loss": 0.0002, "reward": 2.855989098548889, "reward_std": 0.2442128323018551, "rewards/final_reward": 1.5877153634620145, "rewards/mask_iou_reward": 0.7938576817310072, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8559889495372772, "rewards/thk_ans_format_reward": 1.0, "step": 190, "think_completion_length": 63.28125 }, { "clip_ratio": 0.0, "completion_length": 127.1875, "epoch": 0.3220910623946037, "grad_norm": 5.6369854227467835, "kl": 0.1591796875, "learning_rate": 9.355817875210792e-07, "loss": 0.0002, "reward": 2.8796567916870117, "reward_std": 0.24023566395044327, "rewards/final_reward": 1.531722567489063, "rewards/mask_iou_reward": 0.7658612837445316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.879656970500946, "rewards/thk_ans_format_reward": 1.0, "step": 191, "think_completion_length": 55.59375 }, { "clip_ratio": 0.0, "completion_length": 128.34375, "epoch": 0.32377740303541314, "grad_norm": 6.757754073902499, "kl": 0.142578125, "learning_rate": 9.352445193929174e-07, "loss": 0.0001, "reward": 3.378178596496582, "reward_std": 0.29115166515111923, "rewards/final_reward": 1.6156733867945872, "rewards/mask_iou_reward": 0.8078366933972936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.378178596496582, "rewards/thk_ans_format_reward": 1.0, "step": 192, "think_completion_length": 65.03125 }, { "clip_ratio": 0.0, "completion_length": 126.625, "epoch": 0.3254637436762226, "grad_norm": 19.7363508642686, "kl": 0.1484375, "learning_rate": 9.349072512647555e-07, "loss": 0.0001, "reward": 2.5394753217697144, "reward_std": 0.15710114687681198, "rewards/final_reward": 0.2791865142332825, "rewards/mask_iou_reward": 0.13959325711664125, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5394753515720367, "rewards/thk_ans_format_reward": 1.0, "step": 193, "think_completion_length": 53.8125 }, { "clip_ratio": 0.0, "completion_length": 140.828125, "epoch": 0.327150084317032, "grad_norm": 4.867914820040121, "kl": 0.15283203125, "learning_rate": 9.345699831365936e-07, "loss": 0.0002, "reward": 3.283694863319397, "reward_std": 0.3259827569127083, "rewards/final_reward": 1.0945420370292709, "rewards/mask_iou_reward": 0.5472710185146354, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.283694863319397, "rewards/thk_ans_format_reward": 1.0, "step": 194, "think_completion_length": 56.5625 }, { "clip_ratio": 0.0, "completion_length": 157.84375, "epoch": 0.3288364249578415, "grad_norm": 8.986367787783225, "kl": 0.14794921875, "learning_rate": 9.342327150084317e-07, "loss": 0.0001, "reward": 3.0662416219711304, "reward_std": 0.5097359418869019, "rewards/final_reward": 1.0575091683178994, "rewards/mask_iou_reward": 0.5287545841589497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0662416219711304, "rewards/thk_ans_format_reward": 1.0, "step": 195, "think_completion_length": 68.28125 }, { "clip_ratio": 0.0, "completion_length": 125.96875, "epoch": 0.3305227655986509, "grad_norm": 4.458643815289693, "kl": 0.17822265625, "learning_rate": 9.338954468802697e-07, "loss": 0.0002, "reward": 2.6947516202926636, "reward_std": 0.3916451036930084, "rewards/final_reward": 0.6320579136922702, "rewards/mask_iou_reward": 0.3160289568461351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6947516202926636, "rewards/thk_ans_format_reward": 1.0, "step": 196, "think_completion_length": 61.96875 }, { "clip_ratio": 0.0, "completion_length": 144.296875, "epoch": 0.33220910623946037, "grad_norm": 8.627113179501139, "kl": 0.124267578125, "learning_rate": 9.335581787521078e-07, "loss": 0.0001, "reward": 2.435835838317871, "reward_std": 0.2502327188849449, "rewards/final_reward": 0.8284848647940978, "rewards/mask_iou_reward": 0.4142424323970489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.43583589792251587, "rewards/thk_ans_format_reward": 1.0, "step": 197, "think_completion_length": 66.15625 }, { "clip_ratio": 0.0, "completion_length": 137.484375, "epoch": 0.33389544688026984, "grad_norm": 10.433763299061711, "kl": 0.1923828125, "learning_rate": 9.33220910623946e-07, "loss": 0.0002, "reward": 2.810659885406494, "reward_std": 0.2830319292843342, "rewards/final_reward": 1.0020295015503957, "rewards/mask_iou_reward": 0.5010147507751979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8106599748134613, "rewards/thk_ans_format_reward": 1.0, "step": 198, "think_completion_length": 65.9375 }, { "clip_ratio": 0.0, "completion_length": 149.296875, "epoch": 0.33558178752107926, "grad_norm": 4.186736206358417, "kl": 0.2060546875, "learning_rate": 9.328836424957841e-07, "loss": 0.0002, "reward": 2.5021262168884277, "reward_std": 0.4148203581571579, "rewards/final_reward": 1.0089048969836152, "rewards/mask_iou_reward": 0.5044524484918076, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.5333761423826218, "rewards/thk_ans_format_reward": 0.984375, "step": 199, "think_completion_length": 68.34375 }, { "clip_ratio": 0.0, "completion_length": 148.90625, "epoch": 0.3372681281618887, "grad_norm": 9.22747285482429, "kl": 0.16259765625, "learning_rate": 9.325463743676222e-07, "loss": 0.0002, "reward": 3.107977509498596, "reward_std": 0.3112121522426605, "rewards/final_reward": 1.162340815426343, "rewards/mask_iou_reward": 0.5811704077131715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1079775094985962, "rewards/thk_ans_format_reward": 1.0, "step": 200, "think_completion_length": 77.28125 }, { "clip_ratio": 0.0, "completion_length": 135.875, "epoch": 0.33895446880269814, "grad_norm": 9.889047145437663, "kl": 0.18896484375, "learning_rate": 9.322091062394604e-07, "loss": 0.0002, "reward": 3.412969708442688, "reward_std": 0.457067608833313, "rewards/final_reward": 1.4231911438686162, "rewards/mask_iou_reward": 0.7115955719343081, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4129695892333984, "rewards/thk_ans_format_reward": 1.0, "step": 201, "think_completion_length": 72.65625 }, { "clip_ratio": 0.0, "completion_length": 159.5625, "epoch": 0.3406408094435076, "grad_norm": 11.631469733744547, "kl": 0.17578125, "learning_rate": 9.318718381112985e-07, "loss": 0.0002, "reward": 3.0336802005767822, "reward_std": 0.5069368779659271, "rewards/final_reward": 0.775275597891573, "rewards/mask_iou_reward": 0.3876377989457865, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0649300515651703, "rewards/thk_ans_format_reward": 0.984375, "step": 202, "think_completion_length": 76.4375 }, { "clip_ratio": 0.0, "completion_length": 166.484375, "epoch": 0.342327150084317, "grad_norm": 4.719467738906433, "kl": 0.19921875, "learning_rate": 9.315345699831365e-07, "loss": 0.0002, "reward": 2.905851364135742, "reward_std": 0.3175694327801466, "rewards/final_reward": 1.1055562593125423, "rewards/mask_iou_reward": 0.5527781296562712, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9371014535427094, "rewards/thk_ans_format_reward": 0.984375, "step": 203, "think_completion_length": 78.59375 }, { "clip_ratio": 0.0, "completion_length": 148.859375, "epoch": 0.3440134907251265, "grad_norm": 5.397178764490908, "kl": 0.216796875, "learning_rate": 9.311973018549747e-07, "loss": 0.0002, "reward": 3.009737014770508, "reward_std": 0.13770561665296555, "rewards/final_reward": 1.4083112152036812, "rewards/mask_iou_reward": 0.7041556076018406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.009736955165863, "rewards/thk_ans_format_reward": 1.0, "step": 204, "think_completion_length": 86.3125 }, { "clip_ratio": 0.0, "completion_length": 150.84375, "epoch": 0.3456998313659359, "grad_norm": 3.3032440569737367, "kl": 0.19921875, "learning_rate": 9.308600337268127e-07, "loss": 0.0002, "reward": 2.865081787109375, "reward_std": 0.4096911549568176, "rewards/final_reward": 1.1043083846033233, "rewards/mask_iou_reward": 0.5521541923016616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8650819063186646, "rewards/thk_ans_format_reward": 1.0, "step": 205, "think_completion_length": 87.375 }, { "clip_ratio": 0.0, "completion_length": 144.140625, "epoch": 0.3473861720067454, "grad_norm": 10.093800490282504, "kl": 0.1943359375, "learning_rate": 9.305227655986508e-07, "loss": 0.0002, "reward": 2.715272068977356, "reward_std": 0.21183528751134872, "rewards/final_reward": 0.6873399231732058, "rewards/mask_iou_reward": 0.3436699615866029, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7152720987796783, "rewards/thk_ans_format_reward": 1.0, "step": 206, "think_completion_length": 74.4375 }, { "clip_ratio": 0.0, "completion_length": 152.171875, "epoch": 0.3490725126475548, "grad_norm": 4.865522460663028, "kl": 0.21435546875, "learning_rate": 9.30185497470489e-07, "loss": 0.0002, "reward": 3.131924629211426, "reward_std": 0.3374939039349556, "rewards/final_reward": 0.5539088155304328, "rewards/mask_iou_reward": 0.2769544077652164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1319246292114258, "rewards/thk_ans_format_reward": 1.0, "step": 207, "think_completion_length": 85.84375 }, { "clip_ratio": 0.0, "completion_length": 186.703125, "epoch": 0.35075885328836426, "grad_norm": 4.473708487184476, "kl": 0.16455078125, "learning_rate": 9.298482293423271e-07, "loss": 0.0002, "reward": 2.883733034133911, "reward_std": 0.40940313041210175, "rewards/final_reward": 1.436671863940748, "rewards/mask_iou_reward": 0.718335931970374, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9149829745292664, "rewards/thk_ans_format_reward": 0.984375, "step": 208, "think_completion_length": 97.15625 }, { "clip_ratio": 0.0, "completion_length": 198.921875, "epoch": 0.3524451939291737, "grad_norm": 23.788673448569757, "kl": 0.18408203125, "learning_rate": 9.295109612141652e-07, "loss": 0.0002, "reward": 2.941943883895874, "reward_std": 0.4504036456346512, "rewards/final_reward": 0.470037845282397, "rewards/mask_iou_reward": 0.2350189226411985, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.0044439435005188, "rewards/thk_ans_format_reward": 0.96875, "step": 209, "think_completion_length": 75.71875 }, { "clip_ratio": 0.0, "completion_length": 176.328125, "epoch": 0.35413153456998314, "grad_norm": 5.372868142835202, "kl": 0.24853515625, "learning_rate": 9.291736930860034e-07, "loss": 0.0002, "reward": 2.724569797515869, "reward_std": 0.41185761988162994, "rewards/final_reward": 0.7104813213657171, "rewards/mask_iou_reward": 0.35524066068285853, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7558198869228363, "rewards/thk_ans_format_reward": 0.984375, "step": 210, "think_completion_length": 71.375 }, { "clip_ratio": 0.0, "completion_length": 138.265625, "epoch": 0.35581787521079256, "grad_norm": 7.693384240552429, "kl": 0.23828125, "learning_rate": 9.288364249578415e-07, "loss": 0.0002, "reward": 3.142953395843506, "reward_std": 0.4350839853286743, "rewards/final_reward": 1.0316405850495944, "rewards/mask_iou_reward": 0.5158202925247972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1429533958435059, "rewards/thk_ans_format_reward": 1.0, "step": 211, "think_completion_length": 67.46875 }, { "clip_ratio": 0.0, "completion_length": 159.078125, "epoch": 0.357504215851602, "grad_norm": 24.52565671704594, "kl": 0.189453125, "learning_rate": 9.284991568296796e-07, "loss": 0.0002, "reward": 3.507409691810608, "reward_std": 0.2794800400733948, "rewards/final_reward": 1.511632496053278, "rewards/mask_iou_reward": 0.755816248026639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5074098706245422, "rewards/thk_ans_format_reward": 1.0, "step": 212, "think_completion_length": 76.40625 }, { "clip_ratio": 0.0, "completion_length": 159.875, "epoch": 0.3591905564924115, "grad_norm": 104.11907087866024, "kl": 0.2138671875, "learning_rate": 9.281618887015177e-07, "loss": 0.0002, "reward": 3.1027177572250366, "reward_std": 0.3239024728536606, "rewards/final_reward": 1.0709143355153474, "rewards/mask_iou_reward": 0.5354571677576737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1027177274227142, "rewards/thk_ans_format_reward": 1.0, "step": 213, "think_completion_length": 76.0625 }, { "clip_ratio": 0.0, "completion_length": 153.84375, "epoch": 0.3608768971332209, "grad_norm": 6.079891869406322, "kl": 0.22998046875, "learning_rate": 9.278246205733557e-07, "loss": 0.0002, "reward": 3.1863114833831787, "reward_std": 0.35681121051311493, "rewards/final_reward": 1.0386986981609376, "rewards/mask_iou_reward": 0.5193493490804688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1863115429878235, "rewards/thk_ans_format_reward": 1.0, "step": 214, "think_completion_length": 81.0625 }, { "clip_ratio": 0.0, "completion_length": 148.015625, "epoch": 0.3625632377740304, "grad_norm": 20.992549608832288, "kl": 0.18701171875, "learning_rate": 9.274873524451939e-07, "loss": 0.0002, "reward": 3.2150684595108032, "reward_std": 0.3001432493329048, "rewards/final_reward": 1.4833759696702087, "rewards/mask_iou_reward": 0.7416879848351043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2150683999061584, "rewards/thk_ans_format_reward": 1.0, "step": 215, "think_completion_length": 77.59375 }, { "clip_ratio": 0.0, "completion_length": 149.15625, "epoch": 0.3642495784148398, "grad_norm": 3.985537844159363, "kl": 0.21484375, "learning_rate": 9.27150084317032e-07, "loss": 0.0002, "reward": 2.9944499731063843, "reward_std": 0.40752771496772766, "rewards/final_reward": 0.9913912240867233, "rewards/mask_iou_reward": 0.49569561204336166, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9944499731063843, "rewards/thk_ans_format_reward": 1.0, "step": 216, "think_completion_length": 75.21875 }, { "clip_ratio": 0.0, "completion_length": 149.90625, "epoch": 0.36593591905564926, "grad_norm": 12.055847835960044, "kl": 0.2060546875, "learning_rate": 9.268128161888701e-07, "loss": 0.0002, "reward": 3.2611602544784546, "reward_std": 0.2674819827079773, "rewards/final_reward": 1.1373292199253833, "rewards/mask_iou_reward": 0.5686646099626916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2611603140830994, "rewards/thk_ans_format_reward": 1.0, "step": 217, "think_completion_length": 75.9375 }, { "clip_ratio": 0.0, "completion_length": 144.421875, "epoch": 0.3676222596964587, "grad_norm": 4.2602635786354535, "kl": 0.208984375, "learning_rate": 9.264755480607083e-07, "loss": 0.0002, "reward": 2.6974871158599854, "reward_std": 0.3205343186855316, "rewards/final_reward": 0.19768463854294763, "rewards/mask_iou_reward": 0.09884231927147381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6974871754646301, "rewards/thk_ans_format_reward": 1.0, "step": 218, "think_completion_length": 82.34375 }, { "clip_ratio": 0.0, "completion_length": 152.703125, "epoch": 0.36930860033726814, "grad_norm": 5.154948821299118, "kl": 0.20458984375, "learning_rate": 9.261382799325464e-07, "loss": 0.0002, "reward": 3.2684760093688965, "reward_std": 0.37709657847881317, "rewards/final_reward": 1.0383823792548679, "rewards/mask_iou_reward": 0.5191911896274339, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2684761881828308, "rewards/thk_ans_format_reward": 1.0, "step": 219, "think_completion_length": 86.4375 }, { "clip_ratio": 0.0, "completion_length": 131.546875, "epoch": 0.37099494097807756, "grad_norm": 10.042230308759423, "kl": 0.22021484375, "learning_rate": 9.258010118043844e-07, "loss": 0.0002, "reward": 2.912859559059143, "reward_std": 0.18924781680107117, "rewards/final_reward": 0.8126117230948996, "rewards/mask_iou_reward": 0.4063058615474498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9128594994544983, "rewards/thk_ans_format_reward": 1.0, "step": 220, "think_completion_length": 69.3125 }, { "clip_ratio": 0.0, "completion_length": 147.25, "epoch": 0.37268128161888703, "grad_norm": 5.580949729321493, "kl": 0.21826171875, "learning_rate": 9.254637436762226e-07, "loss": 0.0002, "reward": 2.6421183347702026, "reward_std": 0.21327205747365952, "rewards/final_reward": 0.38601525011759963, "rewards/mask_iou_reward": 0.19300762505879981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6421183943748474, "rewards/thk_ans_format_reward": 1.0, "step": 221, "think_completion_length": 69.65625 }, { "clip_ratio": 0.0, "completion_length": 150.828125, "epoch": 0.37436762225969644, "grad_norm": 4.732336023491863, "kl": 0.26611328125, "learning_rate": 9.251264755480606e-07, "loss": 0.0003, "reward": 3.230614185333252, "reward_std": 0.29589555226266384, "rewards/final_reward": 1.5953083083239932, "rewards/mask_iou_reward": 0.7976541541619966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2306141257286072, "rewards/thk_ans_format_reward": 1.0, "step": 222, "think_completion_length": 68.8125 }, { "clip_ratio": 0.0, "completion_length": 149.421875, "epoch": 0.3760539629005059, "grad_norm": 18.74043446734351, "kl": 0.197265625, "learning_rate": 9.247892074198987e-07, "loss": 0.0002, "reward": 2.9248836040496826, "reward_std": 0.1982583925127983, "rewards/final_reward": 1.5204117591821684, "rewards/mask_iou_reward": 0.7602058795910842, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9248837232589722, "rewards/thk_ans_format_reward": 1.0, "step": 223, "think_completion_length": 82.375 }, { "clip_ratio": 0.0, "completion_length": 141.203125, "epoch": 0.3777403035413153, "grad_norm": 4.867690589893153, "kl": 0.2177734375, "learning_rate": 9.244519392917369e-07, "loss": 0.0002, "reward": 2.6547787189483643, "reward_std": 0.2125616818666458, "rewards/final_reward": 0.030600702763538143, "rewards/mask_iou_reward": 0.015300351381769071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6547787487506866, "rewards/thk_ans_format_reward": 1.0, "step": 224, "think_completion_length": 81.1875 }, { "clip_ratio": 0.0, "completion_length": 162.015625, "epoch": 0.3794266441821248, "grad_norm": 5.204370347747985, "kl": 0.22021484375, "learning_rate": 9.24114671163575e-07, "loss": 0.0002, "reward": 3.1091920137405396, "reward_std": 0.3276357799768448, "rewards/final_reward": 1.1539964994728786, "rewards/mask_iou_reward": 0.5769982497364393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1091918349266052, "rewards/thk_ans_format_reward": 1.0, "step": 225, "think_completion_length": 76.0 }, { "clip_ratio": 0.0, "completion_length": 155.40625, "epoch": 0.3811129848229342, "grad_norm": 7.562543453423681, "kl": 0.205078125, "learning_rate": 9.237774030354131e-07, "loss": 0.0002, "reward": 3.0599652528762817, "reward_std": 0.5095875263214111, "rewards/final_reward": 0.9059710345682561, "rewards/mask_iou_reward": 0.45298551728412806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.059965193271637, "rewards/thk_ans_format_reward": 1.0, "step": 226, "think_completion_length": 68.28125 }, { "clip_ratio": 0.0, "completion_length": 139.65625, "epoch": 0.3827993254637437, "grad_norm": 22.849487722280188, "kl": 0.2255859375, "learning_rate": 9.234401349072513e-07, "loss": 0.0002, "reward": 3.07417631149292, "reward_std": 0.37541690468788147, "rewards/final_reward": 1.455683295385246, "rewards/mask_iou_reward": 0.727841647692623, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0741761326789856, "rewards/thk_ans_format_reward": 1.0, "step": 227, "think_completion_length": 66.375 }, { "clip_ratio": 0.0, "completion_length": 169.921875, "epoch": 0.3844856661045531, "grad_norm": 3.2900971059049224, "kl": 0.19873046875, "learning_rate": 9.231028667790893e-07, "loss": 0.0002, "reward": 3.077646255493164, "reward_std": 0.4934057295322418, "rewards/final_reward": 1.4828937291881843, "rewards/mask_iou_reward": 0.7414468645940921, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1088961362838745, "rewards/thk_ans_format_reward": 0.984375, "step": 228, "think_completion_length": 90.5625 }, { "clip_ratio": 0.0, "completion_length": 142.546875, "epoch": 0.38617200674536256, "grad_norm": 6.725551609246913, "kl": 0.21240234375, "learning_rate": 9.227655986509274e-07, "loss": 0.0002, "reward": 3.021067500114441, "reward_std": 0.21517714858055115, "rewards/final_reward": 1.1519169989098978, "rewards/mask_iou_reward": 0.5759584994549489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0210675299167633, "rewards/thk_ans_format_reward": 1.0, "step": 229, "think_completion_length": 70.34375 }, { "clip_ratio": 0.0, "completion_length": 149.734375, "epoch": 0.38785834738617203, "grad_norm": 4.0711352651747195, "kl": 0.205078125, "learning_rate": 9.224283305227656e-07, "loss": 0.0002, "reward": 2.8537700176239014, "reward_std": 0.37378963828086853, "rewards/final_reward": 0.6391967309039653, "rewards/mask_iou_reward": 0.31959836545198267, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8537698686122894, "rewards/thk_ans_format_reward": 1.0, "step": 230, "think_completion_length": 85.21875 }, { "clip_ratio": 0.0, "completion_length": 135.625, "epoch": 0.38954468802698144, "grad_norm": 7.15848584277918, "kl": 0.216796875, "learning_rate": 9.220910623946036e-07, "loss": 0.0002, "reward": 3.125742197036743, "reward_std": 0.26193077489733696, "rewards/final_reward": 1.075020571264583, "rewards/mask_iou_reward": 0.5375102856322915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.125742256641388, "rewards/thk_ans_format_reward": 1.0, "step": 231, "think_completion_length": 63.3125 }, { "clip_ratio": 0.0, "completion_length": 146.59375, "epoch": 0.3912310286677909, "grad_norm": 7.744629541689001, "kl": 0.333984375, "learning_rate": 9.217537942664417e-07, "loss": 0.0003, "reward": 3.0868237018585205, "reward_std": 0.41837984323501587, "rewards/final_reward": 0.9561375750120857, "rewards/mask_iou_reward": 0.47806878750604287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.08682382106781, "rewards/thk_ans_format_reward": 1.0, "step": 232, "think_completion_length": 73.34375 }, { "clip_ratio": 0.0, "completion_length": 139.453125, "epoch": 0.39291736930860033, "grad_norm": 8.404435287451914, "kl": 0.212890625, "learning_rate": 9.214165261382799e-07, "loss": 0.0002, "reward": 2.928203582763672, "reward_std": 0.4975929260253906, "rewards/final_reward": 0.5828971869005508, "rewards/mask_iou_reward": 0.2914485934502754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9282036423683167, "rewards/thk_ans_format_reward": 1.0, "step": 233, "think_completion_length": 65.0 }, { "clip_ratio": 0.0, "completion_length": 179.96875, "epoch": 0.3946037099494098, "grad_norm": 14.476592822333908, "kl": 0.19140625, "learning_rate": 9.21079258010118e-07, "loss": 0.0002, "reward": 2.635561227798462, "reward_std": 0.32975369691848755, "rewards/final_reward": 0.9387983024693395, "rewards/mask_iou_reward": 0.46939915123466974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6355613172054291, "rewards/thk_ans_format_reward": 1.0, "step": 234, "think_completion_length": 80.21875 }, { "clip_ratio": 0.0, "completion_length": 145.765625, "epoch": 0.3962900505902192, "grad_norm": 5.813593781329736, "kl": 0.24169921875, "learning_rate": 9.207419898819561e-07, "loss": 0.0002, "reward": 3.0836466550827026, "reward_std": 0.23019906878471375, "rewards/final_reward": 1.218153597393058, "rewards/mask_iou_reward": 0.609076798696529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0836465060710907, "rewards/thk_ans_format_reward": 1.0, "step": 235, "think_completion_length": 81.59375 }, { "clip_ratio": 0.0, "completion_length": 133.859375, "epoch": 0.3979763912310287, "grad_norm": 7.446504853574746, "kl": 0.2236328125, "learning_rate": 9.204047217537943e-07, "loss": 0.0002, "reward": 2.911117911338806, "reward_std": 0.2473655566573143, "rewards/final_reward": 0.6911036456671604, "rewards/mask_iou_reward": 0.3455518228335802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9111178815364838, "rewards/thk_ans_format_reward": 1.0, "step": 236, "think_completion_length": 58.875 }, { "clip_ratio": 0.0, "completion_length": 137.046875, "epoch": 0.3996627318718381, "grad_norm": 6.320005332811707, "kl": 0.2216796875, "learning_rate": 9.200674536256323e-07, "loss": 0.0002, "reward": 2.954068422317505, "reward_std": 0.4168316461145878, "rewards/final_reward": 0.639526934138163, "rewards/mask_iou_reward": 0.3197634670690815, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9540683627128601, "rewards/thk_ans_format_reward": 1.0, "step": 237, "think_completion_length": 71.75 }, { "clip_ratio": 0.0, "completion_length": 137.09375, "epoch": 0.40134907251264756, "grad_norm": 5.998901795263871, "kl": 0.26318359375, "learning_rate": 9.197301854974705e-07, "loss": 0.0003, "reward": 3.161041498184204, "reward_std": 0.17126264609396458, "rewards/final_reward": 0.8910959114004183, "rewards/mask_iou_reward": 0.44554795570020916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.161041498184204, "rewards/thk_ans_format_reward": 1.0, "step": 238, "think_completion_length": 57.25 }, { "clip_ratio": 0.0, "completion_length": 136.328125, "epoch": 0.403035413153457, "grad_norm": 7.6852691733431655, "kl": 0.22314453125, "learning_rate": 9.193929173693086e-07, "loss": 0.0002, "reward": 2.8813143968582153, "reward_std": 0.2463463842868805, "rewards/final_reward": 1.1477729166412296, "rewards/mask_iou_reward": 0.5738864583206148, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8813144266605377, "rewards/thk_ans_format_reward": 1.0, "step": 239, "think_completion_length": 68.625 }, { "clip_ratio": 0.0, "completion_length": 136.5, "epoch": 0.40472175379426645, "grad_norm": 5.47140326939557, "kl": 0.3115234375, "learning_rate": 9.190556492411466e-07, "loss": 0.0003, "reward": 2.9627087116241455, "reward_std": 0.45273715257644653, "rewards/final_reward": 1.3423065764411553, "rewards/mask_iou_reward": 0.6711532882205776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9627088606357574, "rewards/thk_ans_format_reward": 1.0, "step": 240, "think_completion_length": 67.96875 }, { "clip_ratio": 0.0, "completion_length": 137.34375, "epoch": 0.40640809443507586, "grad_norm": 19.829125482202834, "kl": 0.826171875, "learning_rate": 9.187183811129848e-07, "loss": 0.0008, "reward": 2.9754170179367065, "reward_std": 0.4061010330915451, "rewards/final_reward": 1.052455780557629, "rewards/mask_iou_reward": 0.5262278902788144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9754170775413513, "rewards/thk_ans_format_reward": 1.0, "step": 241, "think_completion_length": 72.5 }, { "clip_ratio": 0.0, "completion_length": 140.15625, "epoch": 0.40809443507588533, "grad_norm": 28.243128773381454, "kl": 0.2041015625, "learning_rate": 9.183811129848229e-07, "loss": 0.0002, "reward": 3.6813935041427612, "reward_std": 0.1992366872727871, "rewards/final_reward": 1.8080672760079621, "rewards/mask_iou_reward": 0.9040336380039811, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6813934445381165, "rewards/thk_ans_format_reward": 1.0, "step": 242, "think_completion_length": 72.1875 }, { "clip_ratio": 0.0, "completion_length": 143.359375, "epoch": 0.40978077571669475, "grad_norm": 140.06539665595105, "kl": 0.2353515625, "learning_rate": 9.18043844856661e-07, "loss": 0.0002, "reward": 3.1565033197402954, "reward_std": 0.4416676461696625, "rewards/final_reward": 1.2314951954134759, "rewards/mask_iou_reward": 0.6157475977067379, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1565033495426178, "rewards/thk_ans_format_reward": 1.0, "step": 243, "think_completion_length": 75.40625 }, { "clip_ratio": 0.0, "completion_length": 145.609375, "epoch": 0.4114671163575042, "grad_norm": 7.392254590000325, "kl": 0.23388671875, "learning_rate": 9.177065767284992e-07, "loss": 0.0002, "reward": 2.6299526691436768, "reward_std": 0.2196236252784729, "rewards/final_reward": 0.4655239442586937, "rewards/mask_iou_reward": 0.23276197212934685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6299527883529663, "rewards/thk_ans_format_reward": 1.0, "step": 244, "think_completion_length": 72.96875 }, { "clip_ratio": 0.0, "completion_length": 132.015625, "epoch": 0.4131534569983137, "grad_norm": 26.232242488237784, "kl": 0.24609375, "learning_rate": 9.173693086003372e-07, "loss": 0.0002, "reward": 2.877917170524597, "reward_std": 0.301144540309906, "rewards/final_reward": 0.19570692172312354, "rewards/mask_iou_reward": 0.09785346086156177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8779171109199524, "rewards/thk_ans_format_reward": 1.0, "step": 245, "think_completion_length": 72.15625 }, { "clip_ratio": 0.0, "completion_length": 146.609375, "epoch": 0.4148397976391231, "grad_norm": 5.983543827094884, "kl": 0.25146484375, "learning_rate": 9.170320404721753e-07, "loss": 0.0003, "reward": 3.2205491065979004, "reward_std": 0.1830149181187153, "rewards/final_reward": 1.852683062161943, "rewards/mask_iou_reward": 0.9263415310809715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2361740469932556, "rewards/thk_ans_format_reward": 0.984375, "step": 246, "think_completion_length": 66.625 }, { "clip_ratio": 0.0, "completion_length": 139.734375, "epoch": 0.41652613827993257, "grad_norm": 7.9257714065232685, "kl": 0.2314453125, "learning_rate": 9.166947723440135e-07, "loss": 0.0002, "reward": 3.2331912517547607, "reward_std": 0.33761420100927353, "rewards/final_reward": 1.2358968368078678, "rewards/mask_iou_reward": 0.6179484184039339, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2331913709640503, "rewards/thk_ans_format_reward": 1.0, "step": 247, "think_completion_length": 76.90625 }, { "clip_ratio": 0.0, "completion_length": 139.328125, "epoch": 0.418212478920742, "grad_norm": 5.404389365456824, "kl": 0.22119140625, "learning_rate": 9.163575042158516e-07, "loss": 0.0002, "reward": 2.731718420982361, "reward_std": 0.15819299221038818, "rewards/final_reward": 0.7831214945094458, "rewards/mask_iou_reward": 0.3915607472547229, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7473434042185545, "rewards/thk_ans_format_reward": 1.0, "step": 248, "think_completion_length": 67.59375 }, { "clip_ratio": 0.0, "completion_length": 142.84375, "epoch": 0.41989881956155145, "grad_norm": 6.896631453177303, "kl": 0.24755859375, "learning_rate": 9.160202360876896e-07, "loss": 0.0002, "reward": 3.7351213693618774, "reward_std": 0.15707053616642952, "rewards/final_reward": 1.690610043355829, "rewards/mask_iou_reward": 0.8453050216779145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7351213693618774, "rewards/thk_ans_format_reward": 1.0, "step": 249, "think_completion_length": 76.84375 }, { "clip_ratio": 0.0, "completion_length": 147.53125, "epoch": 0.42158516020236086, "grad_norm": 11.797769654791798, "kl": 0.24560546875, "learning_rate": 9.156829679595278e-07, "loss": 0.0002, "reward": 3.319765567779541, "reward_std": 0.22760100662708282, "rewards/final_reward": 1.43913601584751, "rewards/mask_iou_reward": 0.719568007923755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.319765418767929, "rewards/thk_ans_format_reward": 1.0, "step": 250, "think_completion_length": 67.65625 }, { "clip_ratio": 0.0, "completion_length": 139.953125, "epoch": 0.42327150084317033, "grad_norm": 4.103837453753516, "kl": 0.228515625, "learning_rate": 9.153456998313659e-07, "loss": 0.0002, "reward": 3.584295630455017, "reward_std": 0.15136371925473213, "rewards/final_reward": 1.371311237407304, "rewards/mask_iou_reward": 0.685655618703652, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.599920630455017, "rewards/thk_ans_format_reward": 0.984375, "step": 251, "think_completion_length": 79.0625 }, { "clip_ratio": 0.0, "completion_length": 134.96875, "epoch": 0.42495784148397975, "grad_norm": 3.4126528618942507, "kl": 0.244140625, "learning_rate": 9.15008431703204e-07, "loss": 0.0002, "reward": 3.032737374305725, "reward_std": 0.218122199177742, "rewards/final_reward": 0.9424829789653526, "rewards/mask_iou_reward": 0.4712414894826763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.032737284898758, "rewards/thk_ans_format_reward": 1.0, "step": 252, "think_completion_length": 72.8125 }, { "clip_ratio": 0.0, "completion_length": 159.078125, "epoch": 0.4266441821247892, "grad_norm": 4.673604751065417, "kl": 0.265625, "learning_rate": 9.146711635750421e-07, "loss": 0.0003, "reward": 2.7446590662002563, "reward_std": 0.311612606048584, "rewards/final_reward": 0.6923241188788728, "rewards/mask_iou_reward": 0.3461620594394364, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7446590214967728, "rewards/thk_ans_format_reward": 1.0, "step": 253, "think_completion_length": 81.625 }, { "clip_ratio": 0.0, "completion_length": 148.9375, "epoch": 0.42833052276559863, "grad_norm": 4.250104365560061, "kl": 0.23486328125, "learning_rate": 9.143338954468802e-07, "loss": 0.0002, "reward": 3.027690887451172, "reward_std": 0.4209420531988144, "rewards/final_reward": 0.9340073633977319, "rewards/mask_iou_reward": 0.46700368169886597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.02769073843956, "rewards/thk_ans_format_reward": 1.0, "step": 254, "think_completion_length": 71.34375 }, { "clip_ratio": 0.0, "completion_length": 139.265625, "epoch": 0.4300168634064081, "grad_norm": 4.1133314139001, "kl": 0.23876953125, "learning_rate": 9.139966273187183e-07, "loss": 0.0002, "reward": 3.125454545021057, "reward_std": 0.4221559911966324, "rewards/final_reward": 1.0887862009263054, "rewards/mask_iou_reward": 0.5443931004631527, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1254545748233795, "rewards/thk_ans_format_reward": 1.0, "step": 255, "think_completion_length": 71.75 }, { "clip_ratio": 0.0, "completion_length": 143.75, "epoch": 0.4317032040472175, "grad_norm": 10.371535649498034, "kl": 0.21533203125, "learning_rate": 9.136593591905565e-07, "loss": 0.0002, "reward": 3.276609420776367, "reward_std": 0.35103708505630493, "rewards/final_reward": 1.1444245548335776, "rewards/mask_iou_reward": 0.5722122774167888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2766093015670776, "rewards/thk_ans_format_reward": 1.0, "step": 256, "think_completion_length": 73.65625 }, { "clip_ratio": 0.0, "completion_length": 158.125, "epoch": 0.433389544688027, "grad_norm": 5.379368798881943, "kl": 0.41845703125, "learning_rate": 9.133220910623946e-07, "loss": 0.0004, "reward": 2.9322704076766968, "reward_std": 0.2937382832169533, "rewards/final_reward": 0.9810359015544533, "rewards/mask_iou_reward": 0.49051795077722665, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9478955268859863, "rewards/thk_ans_format_reward": 0.984375, "step": 257, "think_completion_length": 79.875 }, { "clip_ratio": 0.0, "completion_length": 143.375, "epoch": 0.4350758853288364, "grad_norm": 4.0113801797767, "kl": 0.22900390625, "learning_rate": 9.129848229342326e-07, "loss": 0.0005, "reward": 2.6997172832489014, "reward_std": 0.1351064220070839, "rewards/final_reward": 0.4088354568064064, "rewards/mask_iou_reward": 0.2044177284032032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6997173428535461, "rewards/thk_ans_format_reward": 1.0, "step": 258, "think_completion_length": 81.40625 }, { "clip_ratio": 0.0, "completion_length": 141.25, "epoch": 0.43676222596964587, "grad_norm": 4.619601971068037, "kl": 0.4501953125, "learning_rate": 9.126475548060708e-07, "loss": 0.0004, "reward": 3.021946907043457, "reward_std": 0.17659004405140877, "rewards/final_reward": 1.3774581014446288, "rewards/mask_iou_reward": 0.6887290507223144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0219468921422958, "rewards/thk_ans_format_reward": 1.0, "step": 259, "think_completion_length": 81.0 }, { "clip_ratio": 0.0, "completion_length": 160.484375, "epoch": 0.43844856661045534, "grad_norm": 3.9974465215683224, "kl": 0.2001953125, "learning_rate": 9.123102866779089e-07, "loss": 0.0002, "reward": 3.290497064590454, "reward_std": 0.27161210775375366, "rewards/final_reward": 1.60521735437425, "rewards/mask_iou_reward": 0.802608677187125, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2904971241950989, "rewards/thk_ans_format_reward": 1.0, "step": 260, "think_completion_length": 83.90625 }, { "clip_ratio": 0.0, "completion_length": 147.015625, "epoch": 0.44013490725126475, "grad_norm": 4.270684837247345, "kl": 0.2353515625, "learning_rate": 9.119730185497469e-07, "loss": 0.0002, "reward": 2.9826985597610474, "reward_std": 0.27017855644226074, "rewards/final_reward": 0.9011500105943151, "rewards/mask_iou_reward": 0.45057500529715755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9826985597610474, "rewards/thk_ans_format_reward": 1.0, "step": 261, "think_completion_length": 82.09375 }, { "clip_ratio": 0.0, "completion_length": 154.46875, "epoch": 0.4418212478920742, "grad_norm": 5.01938701583992, "kl": 0.2294921875, "learning_rate": 9.116357504215851e-07, "loss": 0.0002, "reward": 2.971551537513733, "reward_std": 0.17417415231466293, "rewards/final_reward": 0.3612003790476864, "rewards/mask_iou_reward": 0.1806001895238432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9715515375137329, "rewards/thk_ans_format_reward": 1.0, "step": 262, "think_completion_length": 91.90625 }, { "clip_ratio": 0.0, "completion_length": 177.203125, "epoch": 0.44350758853288363, "grad_norm": 6.835284114070682, "kl": 0.2275390625, "learning_rate": 9.112984822934232e-07, "loss": 0.0002, "reward": 3.203929901123047, "reward_std": 0.32439279556274414, "rewards/final_reward": 1.011724169976494, "rewards/mask_iou_reward": 0.505862084988247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2195549309253693, "rewards/thk_ans_format_reward": 0.984375, "step": 263, "think_completion_length": 87.90625 }, { "clip_ratio": 0.0, "completion_length": 193.453125, "epoch": 0.4451939291736931, "grad_norm": 8.018046696171343, "kl": 0.23388671875, "learning_rate": 9.109612141652614e-07, "loss": 0.0002, "reward": 2.3503458499908447, "reward_std": 0.22059325873851776, "rewards/final_reward": 0.5475854655181758, "rewards/mask_iou_reward": 0.2737927327590879, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.35034577548503876, "rewards/thk_ans_format_reward": 1.0, "step": 264, "think_completion_length": 87.625 }, { "clip_ratio": 0.0, "completion_length": 163.5625, "epoch": 0.4468802698145025, "grad_norm": 5.207261902627235, "kl": 0.203125, "learning_rate": 9.106239460370995e-07, "loss": 0.0002, "reward": 3.234385371208191, "reward_std": 0.2574783265590668, "rewards/final_reward": 1.7540360878854586, "rewards/mask_iou_reward": 0.8770180439427293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2343851923942566, "rewards/thk_ans_format_reward": 1.0, "step": 265, "think_completion_length": 97.34375 }, { "clip_ratio": 0.0, "completion_length": 177.828125, "epoch": 0.448566610455312, "grad_norm": 4.894515919476582, "kl": 0.1962890625, "learning_rate": 9.102866779089376e-07, "loss": 0.0002, "reward": 3.1356217861175537, "reward_std": 0.3286105990409851, "rewards/final_reward": 1.07730285055642, "rewards/mask_iou_reward": 0.53865142527821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1356218457221985, "rewards/thk_ans_format_reward": 1.0, "step": 266, "think_completion_length": 107.34375 }, { "clip_ratio": 0.0, "completion_length": 197.28125, "epoch": 0.4502529510961214, "grad_norm": 5.429602045870938, "kl": 0.1943359375, "learning_rate": 9.099494097807757e-07, "loss": 0.0002, "reward": 2.8694194555282593, "reward_std": 0.41457006335258484, "rewards/final_reward": 0.9558318417611935, "rewards/mask_iou_reward": 0.47791592088059676, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9006692469120026, "rewards/thk_ans_format_reward": 0.984375, "step": 267, "think_completion_length": 101.65625 }, { "clip_ratio": 0.0, "completion_length": 181.890625, "epoch": 0.45193929173693087, "grad_norm": 12.128411701219036, "kl": 0.22998046875, "learning_rate": 9.096121416526138e-07, "loss": 0.0002, "reward": 2.6065198183059692, "reward_std": 0.37309861183166504, "rewards/final_reward": 0.198079350131303, "rewards/mask_iou_reward": 0.0990396750656515, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6065197587013245, "rewards/thk_ans_format_reward": 1.0, "step": 268, "think_completion_length": 105.8125 }, { "clip_ratio": 0.0, "completion_length": 184.828125, "epoch": 0.4536256323777403, "grad_norm": 5.932760228161688, "kl": 0.1904296875, "learning_rate": 9.092748735244519e-07, "loss": 0.0002, "reward": 2.5283087491989136, "reward_std": 0.5122461318969727, "rewards/final_reward": 0.5003546532870379, "rewards/mask_iou_reward": 0.25017732664351894, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5283087491989136, "rewards/thk_ans_format_reward": 1.0, "step": 269, "think_completion_length": 109.375 }, { "clip_ratio": 0.0, "completion_length": 172.34375, "epoch": 0.45531197301854975, "grad_norm": 3.035312901499431, "kl": 0.20703125, "learning_rate": 9.0893760539629e-07, "loss": 0.0002, "reward": 3.0272092819213867, "reward_std": 0.6400187015533447, "rewards/final_reward": 0.7176652425113891, "rewards/mask_iou_reward": 0.35883262125569454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0272094011306763, "rewards/thk_ans_format_reward": 1.0, "step": 270, "think_completion_length": 95.3125 }, { "clip_ratio": 0.0, "completion_length": 176.5625, "epoch": 0.45699831365935917, "grad_norm": 3.3097869393690655, "kl": 0.2314453125, "learning_rate": 9.086003372681281e-07, "loss": 0.0002, "reward": 3.161636710166931, "reward_std": 0.40839092433452606, "rewards/final_reward": 0.9878549438773723, "rewards/mask_iou_reward": 0.49392747193868614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1616367101669312, "rewards/thk_ans_format_reward": 1.0, "step": 271, "think_completion_length": 106.59375 }, { "clip_ratio": 0.0, "completion_length": 176.9375, "epoch": 0.45868465430016864, "grad_norm": 13.954579861181404, "kl": 0.228515625, "learning_rate": 9.082630691399662e-07, "loss": 0.0002, "reward": 3.0976139307022095, "reward_std": 0.33141565322875977, "rewards/final_reward": 0.9411599724049827, "rewards/mask_iou_reward": 0.47057998620249136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.097613900899887, "rewards/thk_ans_format_reward": 1.0, "step": 272, "think_completion_length": 98.78125 }, { "clip_ratio": 0.0, "completion_length": 181.953125, "epoch": 0.46037099494097805, "grad_norm": 6.285702802859748, "kl": 0.240234375, "learning_rate": 9.079258010118044e-07, "loss": 0.0002, "reward": 2.4280236959457397, "reward_std": 0.24160349369049072, "rewards/final_reward": 0.22331540502332667, "rewards/mask_iou_reward": 0.11165770251166333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.42802368104457855, "rewards/thk_ans_format_reward": 1.0, "step": 273, "think_completion_length": 109.4375 }, { "clip_ratio": 0.0, "completion_length": 160.828125, "epoch": 0.4620573355817875, "grad_norm": 6.371702634806563, "kl": 0.228515625, "learning_rate": 9.075885328836425e-07, "loss": 0.0002, "reward": 2.871516704559326, "reward_std": 0.31683170795440674, "rewards/final_reward": 1.2616139609735537, "rewards/mask_iou_reward": 0.6308069804867769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8715166747570038, "rewards/thk_ans_format_reward": 1.0, "step": 274, "think_completion_length": 91.28125 }, { "clip_ratio": 0.0, "completion_length": 180.046875, "epoch": 0.463743676222597, "grad_norm": 9.870553593778212, "kl": 0.2265625, "learning_rate": 9.072512647554806e-07, "loss": 0.0002, "reward": 3.31233549118042, "reward_std": 0.3460022658109665, "rewards/final_reward": 1.3883268676465192, "rewards/mask_iou_reward": 0.6941634338232596, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3123353123664856, "rewards/thk_ans_format_reward": 1.0, "step": 275, "think_completion_length": 118.0625 }, { "clip_ratio": 0.0, "completion_length": 179.4375, "epoch": 0.4654300168634064, "grad_norm": 3.4862830215166274, "kl": 0.2431640625, "learning_rate": 9.069139966273187e-07, "loss": 0.0002, "reward": 2.803930640220642, "reward_std": 0.25581035390496254, "rewards/final_reward": 0.8173206091655374, "rewards/mask_iou_reward": 0.4086603045827687, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8039306998252869, "rewards/thk_ans_format_reward": 1.0, "step": 276, "think_completion_length": 112.90625 }, { "clip_ratio": 0.0, "completion_length": 177.34375, "epoch": 0.4671163575042159, "grad_norm": 3.492125265472315, "kl": 0.19677734375, "learning_rate": 9.065767284991568e-07, "loss": 0.0002, "reward": 3.049085855484009, "reward_std": 0.30213601887226105, "rewards/final_reward": 1.094389741188175, "rewards/mask_iou_reward": 0.5471948705940874, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0490858852863312, "rewards/thk_ans_format_reward": 1.0, "step": 277, "think_completion_length": 109.5 }, { "clip_ratio": 0.0, "completion_length": 189.453125, "epoch": 0.4688026981450253, "grad_norm": 4.262931594041564, "kl": 0.2021484375, "learning_rate": 9.062394603709948e-07, "loss": 0.0002, "reward": 2.6954082250595093, "reward_std": 0.362802118062973, "rewards/final_reward": 0.7194369269474945, "rewards/mask_iou_reward": 0.35971846347374725, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.71103335916996, "rewards/thk_ans_format_reward": 1.0, "step": 278, "think_completion_length": 103.59375 }, { "clip_ratio": 0.0, "completion_length": 195.453125, "epoch": 0.47048903878583476, "grad_norm": 5.514757215166888, "kl": 0.23876953125, "learning_rate": 9.05902192242833e-07, "loss": 0.0002, "reward": 2.784053087234497, "reward_std": 0.2925054356455803, "rewards/final_reward": 1.4111391442364694, "rewards/mask_iou_reward": 0.7055695721182347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7840530574321747, "rewards/thk_ans_format_reward": 1.0, "step": 279, "think_completion_length": 128.5625 }, { "clip_ratio": 0.0, "completion_length": 192.5, "epoch": 0.47217537942664417, "grad_norm": 3.6758711061332123, "kl": 0.208984375, "learning_rate": 9.055649241146711e-07, "loss": 0.0002, "reward": 3.4984025955200195, "reward_std": 0.3452688194811344, "rewards/final_reward": 1.6300352304452077, "rewards/mask_iou_reward": 0.8150176152226039, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.5296525955200195, "rewards/thk_ans_format_reward": 0.984375, "step": 280, "think_completion_length": 95.75 }, { "clip_ratio": 0.0, "completion_length": 222.890625, "epoch": 0.47386172006745364, "grad_norm": 5.7177998821481335, "kl": 0.2177734375, "learning_rate": 9.052276559865092e-07, "loss": 0.0002, "reward": 2.33639395236969, "reward_std": 0.38313548266887665, "rewards/final_reward": 0.46264121102616823, "rewards/mask_iou_reward": 0.23132060551308412, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.39889395236968994, "rewards/thk_ans_format_reward": 0.96875, "step": 281, "think_completion_length": 100.4375 }, { "clip_ratio": 0.0, "completion_length": 185.34375, "epoch": 0.47554806070826305, "grad_norm": 3.138825572604774, "kl": 0.1982421875, "learning_rate": 9.048903878583474e-07, "loss": 0.0002, "reward": 3.0318186283111572, "reward_std": 0.33777186274528503, "rewards/final_reward": 0.937564592938467, "rewards/mask_iou_reward": 0.4687822964692335, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0630684792995453, "rewards/thk_ans_format_reward": 0.984375, "step": 282, "think_completion_length": 101.40625 }, { "clip_ratio": 0.0, "completion_length": 175.109375, "epoch": 0.4772344013490725, "grad_norm": 6.630513644178006, "kl": 0.20751953125, "learning_rate": 9.045531197301855e-07, "loss": 0.0002, "reward": 2.7624622583389282, "reward_std": 0.2361261248588562, "rewards/final_reward": 0.7597826504116494, "rewards/mask_iou_reward": 0.3798913252058247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7624624371528625, "rewards/thk_ans_format_reward": 1.0, "step": 283, "think_completion_length": 100.28125 }, { "clip_ratio": 0.0, "completion_length": 196.15625, "epoch": 0.47892074198988194, "grad_norm": 11.039690750084555, "kl": 0.2099609375, "learning_rate": 9.042158516020235e-07, "loss": 0.0002, "reward": 3.314423441886902, "reward_std": 0.2912362292408943, "rewards/final_reward": 0.8603434719464518, "rewards/mask_iou_reward": 0.4301717359732259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3144232034683228, "rewards/thk_ans_format_reward": 1.0, "step": 284, "think_completion_length": 112.25 }, { "clip_ratio": 0.0, "completion_length": 175.921875, "epoch": 0.4806070826306914, "grad_norm": 5.349575558185867, "kl": 0.23828125, "learning_rate": 9.038785834738617e-07, "loss": 0.0002, "reward": 3.142780900001526, "reward_std": 0.22693616338074207, "rewards/final_reward": 1.1724235022393383, "rewards/mask_iou_reward": 0.5862117511196692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1427810192108154, "rewards/thk_ans_format_reward": 1.0, "step": 285, "think_completion_length": 101.96875 }, { "clip_ratio": 0.0, "completion_length": 172.921875, "epoch": 0.4822934232715008, "grad_norm": 4.78348858573279, "kl": 0.2314453125, "learning_rate": 9.035413153456997e-07, "loss": 0.0002, "reward": 3.2198691368103027, "reward_std": 0.30350401997566223, "rewards/final_reward": 1.092966510959865, "rewards/mask_iou_reward": 0.5464832554799325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.219869077205658, "rewards/thk_ans_format_reward": 1.0, "step": 286, "think_completion_length": 111.5 }, { "clip_ratio": 0.0, "completion_length": 194.09375, "epoch": 0.4839797639123103, "grad_norm": 10.947389225955124, "kl": 0.2138671875, "learning_rate": 9.032040472175379e-07, "loss": 0.0002, "reward": 2.8175196647644043, "reward_std": 0.17539205588400364, "rewards/final_reward": 0.24655763983664442, "rewards/mask_iou_reward": 0.12327881991832221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8175197541713715, "rewards/thk_ans_format_reward": 1.0, "step": 287, "think_completion_length": 103.5625 }, { "clip_ratio": 0.0, "completion_length": 162.890625, "epoch": 0.4856661045531197, "grad_norm": 8.330590502308805, "kl": 0.22314453125, "learning_rate": 9.02866779089376e-07, "loss": 0.0002, "reward": 2.9739983081817627, "reward_std": 0.48663294315338135, "rewards/final_reward": 0.9481654836818227, "rewards/mask_iou_reward": 0.47408274184091137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9739983677864075, "rewards/thk_ans_format_reward": 1.0, "step": 288, "think_completion_length": 100.59375 }, { "clip_ratio": 0.0, "completion_length": 173.71875, "epoch": 0.4873524451939292, "grad_norm": 14.412230517681351, "kl": 0.24658203125, "learning_rate": 9.025295109612141e-07, "loss": 0.0002, "reward": 3.198970317840576, "reward_std": 0.4464504271745682, "rewards/final_reward": 1.0407472399001998, "rewards/mask_iou_reward": 0.5203736199500999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1989702880382538, "rewards/thk_ans_format_reward": 1.0, "step": 289, "think_completion_length": 109.71875 }, { "clip_ratio": 0.0, "completion_length": 183.125, "epoch": 0.48903878583473864, "grad_norm": 5.205770260765118, "kl": 0.24609375, "learning_rate": 9.021922428330523e-07, "loss": 0.0002, "reward": 3.387374758720398, "reward_std": 0.2529134303331375, "rewards/final_reward": 1.6759642648813542, "rewards/mask_iou_reward": 0.8379821324406771, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3873746991157532, "rewards/thk_ans_format_reward": 1.0, "step": 290, "think_completion_length": 94.9375 }, { "clip_ratio": 0.0, "completion_length": 166.90625, "epoch": 0.49072512647554806, "grad_norm": 3.900650643571578, "kl": 0.21435546875, "learning_rate": 9.018549747048904e-07, "loss": 0.0002, "reward": 2.4927295446395874, "reward_std": 0.18346240185201168, "rewards/final_reward": 0.17499262628138723, "rewards/mask_iou_reward": 0.08749631314069362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4927296042442322, "rewards/thk_ans_format_reward": 1.0, "step": 291, "think_completion_length": 91.6875 }, { "clip_ratio": 0.0, "completion_length": 217.03125, "epoch": 0.4924114671163575, "grad_norm": 7.886194507737518, "kl": 0.2470703125, "learning_rate": 9.015177065767285e-07, "loss": 0.0002, "reward": 3.286818265914917, "reward_std": 0.3581873029470444, "rewards/final_reward": 1.2156858672661348, "rewards/mask_iou_reward": 0.6078429336330674, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.318068265914917, "rewards/thk_ans_format_reward": 0.984375, "step": 292, "think_completion_length": 125.34375 }, { "clip_ratio": 0.0, "completion_length": 180.40625, "epoch": 0.49409780775716694, "grad_norm": 10.927375058352002, "kl": 0.20556640625, "learning_rate": 9.011804384485667e-07, "loss": 0.0002, "reward": 3.2041701078414917, "reward_std": 0.12643220275640488, "rewards/final_reward": 1.5542415887341692, "rewards/mask_iou_reward": 0.7771207943670846, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2041699886322021, "rewards/thk_ans_format_reward": 1.0, "step": 293, "think_completion_length": 107.28125 }, { "clip_ratio": 0.0, "completion_length": 154.546875, "epoch": 0.4957841483979764, "grad_norm": 4.79296532813768, "kl": 0.236328125, "learning_rate": 9.008431703204047e-07, "loss": 0.0002, "reward": 3.6925183534622192, "reward_std": 0.17621507868170738, "rewards/final_reward": 1.5201054729840793, "rewards/mask_iou_reward": 0.7600527364920396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.692518174648285, "rewards/thk_ans_format_reward": 1.0, "step": 294, "think_completion_length": 80.40625 }, { "clip_ratio": 0.0, "completion_length": 171.296875, "epoch": 0.4974704890387858, "grad_norm": 18.374741866025047, "kl": 0.1962890625, "learning_rate": 9.005059021922427e-07, "loss": 0.0002, "reward": 3.449096202850342, "reward_std": 0.09678211063146591, "rewards/final_reward": 1.7411923961628277, "rewards/mask_iou_reward": 0.8705961980814139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4647212624549866, "rewards/thk_ans_format_reward": 0.984375, "step": 295, "think_completion_length": 100.21875 }, { "clip_ratio": 0.0, "completion_length": 239.671875, "epoch": 0.4991568296795953, "grad_norm": 4.434564260613753, "kl": 0.2783203125, "learning_rate": 9.001686340640809e-07, "loss": 0.0003, "reward": 2.545999526977539, "reward_std": 0.38757744431495667, "rewards/final_reward": 0.3226123989686819, "rewards/mask_iou_reward": 0.16130619948434094, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5459995269775391, "rewards/thk_ans_format_reward": 1.0, "step": 296, "think_completion_length": 86.3125 }, { "clip_ratio": 0.0, "completion_length": 192.484375, "epoch": 0.5008431703204047, "grad_norm": 23.58574110222777, "kl": 0.208984375, "learning_rate": 8.99831365935919e-07, "loss": 0.0002, "reward": 2.6845574378967285, "reward_std": 0.7191915810108185, "rewards/final_reward": 1.0216346625140318, "rewards/mask_iou_reward": 0.5108173312570159, "rewards/sam_format_reward": 0.90625, "rewards/sam_reward_func_ultra": 0.7783074378967285, "rewards/thk_ans_format_reward": 1.0, "step": 297, "think_completion_length": 102.59375 }, { "clip_ratio": 0.0, "completion_length": 154.234375, "epoch": 0.5025295109612141, "grad_norm": 9.285836380042333, "kl": 0.2109375, "learning_rate": 8.994940978077571e-07, "loss": 0.0002, "reward": 3.4048011302948, "reward_std": 0.15381522569805384, "rewards/final_reward": 1.265697850525529, "rewards/mask_iou_reward": 0.6328489252627645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4048011302947998, "rewards/thk_ans_format_reward": 1.0, "step": 298, "think_completion_length": 92.25 }, { "clip_ratio": 0.0, "completion_length": 187.578125, "epoch": 0.5042158516020236, "grad_norm": 8.44030956368844, "kl": 0.19580078125, "learning_rate": 8.991568296795953e-07, "loss": 0.0002, "reward": 3.108208417892456, "reward_std": 0.45496051013469696, "rewards/final_reward": 1.314274766823605, "rewards/mask_iou_reward": 0.6571373834118025, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1082082986831665, "rewards/thk_ans_format_reward": 1.0, "step": 299, "think_completion_length": 95.625 }, { "clip_ratio": 0.0, "completion_length": 166.15625, "epoch": 0.5059021922428331, "grad_norm": 5.275100464525899, "kl": 0.20849609375, "learning_rate": 8.988195615514334e-07, "loss": 0.0002, "reward": 3.0727845430374146, "reward_std": 0.338802233338356, "rewards/final_reward": 1.0132605632547391, "rewards/mask_iou_reward": 0.5066302816273696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.072784423828125, "rewards/thk_ans_format_reward": 1.0, "step": 300, "think_completion_length": 108.9375 }, { "clip_ratio": 0.0, "completion_length": 160.71875, "epoch": 0.5075885328836425, "grad_norm": 8.013159633748892, "kl": 0.20458984375, "learning_rate": 8.984822934232715e-07, "loss": 0.0002, "reward": 3.0154377222061157, "reward_std": 0.28923243284225464, "rewards/final_reward": 0.7514977248736657, "rewards/mask_iou_reward": 0.37574886243683286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0154377818107605, "rewards/thk_ans_format_reward": 1.0, "step": 301, "think_completion_length": 88.53125 }, { "clip_ratio": 0.0, "completion_length": 196.578125, "epoch": 0.5092748735244519, "grad_norm": 4.742694603027937, "kl": 0.22265625, "learning_rate": 8.981450252951097e-07, "loss": 0.0002, "reward": 3.021862030029297, "reward_std": 0.45061095058918, "rewards/final_reward": 0.8931014002412393, "rewards/mask_iou_reward": 0.44655070012061965, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0531122088432312, "rewards/thk_ans_format_reward": 0.984375, "step": 302, "think_completion_length": 108.0625 }, { "clip_ratio": 0.0, "completion_length": 162.296875, "epoch": 0.5109612141652614, "grad_norm": 8.474267712901908, "kl": 0.251953125, "learning_rate": 8.978077571669476e-07, "loss": 0.0003, "reward": 2.9662917852401733, "reward_std": 0.6356634199619293, "rewards/final_reward": 1.029761918343917, "rewards/mask_iou_reward": 0.5148809591719585, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9662917256355286, "rewards/thk_ans_format_reward": 1.0, "step": 303, "think_completion_length": 82.96875 }, { "clip_ratio": 0.0, "completion_length": 163.78125, "epoch": 0.5126475548060708, "grad_norm": 7.372693309576553, "kl": 0.2939453125, "learning_rate": 8.974704890387857e-07, "loss": 0.0003, "reward": 2.326613187789917, "reward_std": 0.2381967380642891, "rewards/final_reward": 0.05411758343514183, "rewards/mask_iou_reward": 0.027058791717570915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.3266131319105625, "rewards/thk_ans_format_reward": 1.0, "step": 304, "think_completion_length": 86.125 }, { "clip_ratio": 0.0, "completion_length": 146.421875, "epoch": 0.5143338954468802, "grad_norm": 8.446925811165816, "kl": 0.224609375, "learning_rate": 8.971332209106239e-07, "loss": 0.0002, "reward": 3.1610687971115112, "reward_std": 0.26836006343364716, "rewards/final_reward": 0.9938329226396494, "rewards/mask_iou_reward": 0.4969164613198247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1610687971115112, "rewards/thk_ans_format_reward": 1.0, "step": 305, "think_completion_length": 80.125 }, { "clip_ratio": 0.0, "completion_length": 162.234375, "epoch": 0.5160202360876898, "grad_norm": 8.702016230273058, "kl": 0.2177734375, "learning_rate": 8.96795952782462e-07, "loss": 0.0002, "reward": 3.011886239051819, "reward_std": 0.32331761717796326, "rewards/final_reward": 0.919041138538325, "rewards/mask_iou_reward": 0.4595205692691625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0118862390518188, "rewards/thk_ans_format_reward": 1.0, "step": 306, "think_completion_length": 85.9375 }, { "clip_ratio": 0.0, "completion_length": 145.9375, "epoch": 0.5177065767284992, "grad_norm": 3.980012513394504, "kl": 0.2255859375, "learning_rate": 8.964586846543001e-07, "loss": 0.0002, "reward": 2.9461448192596436, "reward_std": 0.10024909558705986, "rewards/final_reward": 1.7532795550322358, "rewards/mask_iou_reward": 0.8766397775161179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9461447596549988, "rewards/thk_ans_format_reward": 1.0, "step": 307, "think_completion_length": 75.25 }, { "clip_ratio": 0.0, "completion_length": 150.0625, "epoch": 0.5193929173693086, "grad_norm": 7.147985372138769, "kl": 0.2197265625, "learning_rate": 8.961214165261383e-07, "loss": 0.0002, "reward": 2.450806975364685, "reward_std": 0.22551338374614716, "rewards/final_reward": 0.2143655313679812, "rewards/mask_iou_reward": 0.1071827656839906, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.45080701261758804, "rewards/thk_ans_format_reward": 1.0, "step": 308, "think_completion_length": 81.3125 }, { "clip_ratio": 0.0, "completion_length": 142.421875, "epoch": 0.521079258010118, "grad_norm": 5.551548351845276, "kl": 0.23486328125, "learning_rate": 8.957841483979764e-07, "loss": 0.0002, "reward": 3.691397547721863, "reward_std": 0.05219288542866707, "rewards/final_reward": 1.5435201661631972, "rewards/mask_iou_reward": 0.7717600830815986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6913974285125732, "rewards/thk_ans_format_reward": 1.0, "step": 309, "think_completion_length": 79.375 }, { "clip_ratio": 0.0, "completion_length": 142.203125, "epoch": 0.5227655986509275, "grad_norm": 11.529649035295114, "kl": 0.24609375, "learning_rate": 8.954468802698145e-07, "loss": 0.0002, "reward": 2.901958465576172, "reward_std": 0.31998536735773087, "rewards/final_reward": 0.873896867313297, "rewards/mask_iou_reward": 0.4369484336566485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9019584059715271, "rewards/thk_ans_format_reward": 1.0, "step": 310, "think_completion_length": 66.21875 }, { "clip_ratio": 0.0, "completion_length": 143.96875, "epoch": 0.524451939291737, "grad_norm": 6.252373978112095, "kl": 0.25439453125, "learning_rate": 8.951096121416525e-07, "loss": 0.0003, "reward": 2.746753692626953, "reward_std": 0.17325819842517376, "rewards/final_reward": 0.7760384249471024, "rewards/mask_iou_reward": 0.3880192124735512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7467536926269531, "rewards/thk_ans_format_reward": 1.0, "step": 311, "think_completion_length": 76.78125 }, { "clip_ratio": 0.0, "completion_length": 137.09375, "epoch": 0.5261382799325464, "grad_norm": 17.287381416733663, "kl": 0.326171875, "learning_rate": 8.947723440134906e-07, "loss": 0.0003, "reward": 3.181082844734192, "reward_std": 0.42962639033794403, "rewards/final_reward": 1.0247396851042567, "rewards/mask_iou_reward": 0.5123698425521284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.181082844734192, "rewards/thk_ans_format_reward": 1.0, "step": 312, "think_completion_length": 70.4375 }, { "clip_ratio": 0.0, "completion_length": 137.8125, "epoch": 0.5278246205733558, "grad_norm": 4.662803047611527, "kl": 0.255859375, "learning_rate": 8.944350758853288e-07, "loss": 0.0002, "reward": 3.6371694803237915, "reward_std": 0.04736559418961406, "rewards/final_reward": 1.8132385278320307, "rewards/mask_iou_reward": 0.9066192639160153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6371695399284363, "rewards/thk_ans_format_reward": 1.0, "step": 313, "think_completion_length": 71.5625 }, { "clip_ratio": 0.0, "completion_length": 137.453125, "epoch": 0.5295109612141653, "grad_norm": 5.2767043392470905, "kl": 0.2578125, "learning_rate": 8.940978077571669e-07, "loss": 0.0003, "reward": 2.710046172142029, "reward_std": 0.3843417316675186, "rewards/final_reward": 0.8192079907416744, "rewards/mask_iou_reward": 0.4096039953708372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7100460827350616, "rewards/thk_ans_format_reward": 1.0, "step": 314, "think_completion_length": 65.40625 }, { "clip_ratio": 0.0, "completion_length": 142.0625, "epoch": 0.5311973018549747, "grad_norm": 5.645559638838532, "kl": 0.2685546875, "learning_rate": 8.93760539629005e-07, "loss": 0.0003, "reward": 2.684740424156189, "reward_std": 0.3946193754673004, "rewards/final_reward": 0.541776743554244, "rewards/mask_iou_reward": 0.270888371777122, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6847403347492218, "rewards/thk_ans_format_reward": 1.0, "step": 315, "think_completion_length": 76.90625 }, { "clip_ratio": 0.0, "completion_length": 138.53125, "epoch": 0.5328836424957841, "grad_norm": 3.2402841409799064, "kl": 0.25, "learning_rate": 8.934232715008432e-07, "loss": 0.0002, "reward": 3.3814308643341064, "reward_std": 0.13638974726200104, "rewards/final_reward": 1.1814053927053858, "rewards/mask_iou_reward": 0.5907026963526929, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.381430983543396, "rewards/thk_ans_format_reward": 1.0, "step": 316, "think_completion_length": 73.96875 }, { "clip_ratio": 0.0, "completion_length": 135.453125, "epoch": 0.5345699831365935, "grad_norm": 11.639876433776317, "kl": 0.2314453125, "learning_rate": 8.930860033726813e-07, "loss": 0.0002, "reward": 2.7054232358932495, "reward_std": 0.3478597477078438, "rewards/final_reward": 1.0303473977491717, "rewards/mask_iou_reward": 0.5151736988745859, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7054231911897659, "rewards/thk_ans_format_reward": 1.0, "step": 317, "think_completion_length": 66.375 }, { "clip_ratio": 0.0, "completion_length": 135.5, "epoch": 0.5362563237774031, "grad_norm": 3.7880326698570377, "kl": 0.24658203125, "learning_rate": 8.927487352445194e-07, "loss": 0.0002, "reward": 2.7247750759124756, "reward_std": 0.29200705885887146, "rewards/final_reward": 0.6541380888774231, "rewards/mask_iou_reward": 0.32706904443871154, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7247750461101532, "rewards/thk_ans_format_reward": 1.0, "step": 318, "think_completion_length": 64.03125 }, { "clip_ratio": 0.0, "completion_length": 133.6875, "epoch": 0.5379426644182125, "grad_norm": 11.191965779893017, "kl": 0.23828125, "learning_rate": 8.924114671163576e-07, "loss": 0.0002, "reward": 3.127274513244629, "reward_std": 0.27082036435604095, "rewards/final_reward": 0.8115027511302748, "rewards/mask_iou_reward": 0.4057513755651374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1272744536399841, "rewards/thk_ans_format_reward": 1.0, "step": 319, "think_completion_length": 58.9375 }, { "clip_ratio": 0.0, "completion_length": 137.703125, "epoch": 0.5396290050590219, "grad_norm": 6.2600404543257895, "kl": 0.294921875, "learning_rate": 8.920741989881955e-07, "loss": 0.0003, "reward": 3.504320979118347, "reward_std": 0.2580454498529434, "rewards/final_reward": 1.7895027190421027, "rewards/mask_iou_reward": 0.8947513595210513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.504321038722992, "rewards/thk_ans_format_reward": 1.0, "step": 320, "think_completion_length": 68.8125 }, { "clip_ratio": 0.0, "completion_length": 127.078125, "epoch": 0.5413153456998314, "grad_norm": 4.75179766813063, "kl": 0.2607421875, "learning_rate": 8.917369308600336e-07, "loss": 0.0003, "reward": 2.915022373199463, "reward_std": 0.38734908401966095, "rewards/final_reward": 0.9229503567709705, "rewards/mask_iou_reward": 0.46147517838548524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9150223135948181, "rewards/thk_ans_format_reward": 1.0, "step": 321, "think_completion_length": 61.28125 }, { "clip_ratio": 0.0, "completion_length": 134.59375, "epoch": 0.5430016863406408, "grad_norm": 23.226641449249314, "kl": 0.36572265625, "learning_rate": 8.913996627318718e-07, "loss": 0.0004, "reward": 3.505223870277405, "reward_std": 0.25062863528728485, "rewards/final_reward": 1.6520057781269646, "rewards/mask_iou_reward": 0.8260028890634823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5052238702774048, "rewards/thk_ans_format_reward": 1.0, "step": 322, "think_completion_length": 70.25 }, { "clip_ratio": 0.0, "completion_length": 132.859375, "epoch": 0.5446880269814502, "grad_norm": 4.361418731881005, "kl": 0.404296875, "learning_rate": 8.910623946037099e-07, "loss": 0.0004, "reward": 3.1083903312683105, "reward_std": 0.23866655677556992, "rewards/final_reward": 0.7571974737717716, "rewards/mask_iou_reward": 0.3785987368858858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1083903908729553, "rewards/thk_ans_format_reward": 1.0, "step": 323, "think_completion_length": 62.25 }, { "clip_ratio": 0.0, "completion_length": 128.9375, "epoch": 0.5463743676222597, "grad_norm": 4.5546780251605385, "kl": 0.248046875, "learning_rate": 8.90725126475548e-07, "loss": 0.0002, "reward": 2.8330706357955933, "reward_std": 0.14445627853274345, "rewards/final_reward": 0.9145134767110203, "rewards/mask_iou_reward": 0.4572567383555102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8330707550048828, "rewards/thk_ans_format_reward": 1.0, "step": 324, "think_completion_length": 62.4375 }, { "clip_ratio": 0.0, "completion_length": 126.96875, "epoch": 0.5480607082630692, "grad_norm": 54.15924549523269, "kl": 3.556640625, "learning_rate": 8.903878583473862e-07, "loss": 0.0036, "reward": 2.7553576231002808, "reward_std": 0.31073715165257454, "rewards/final_reward": 1.0346113208464451, "rewards/mask_iou_reward": 0.5173056604232226, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7553575932979584, "rewards/thk_ans_format_reward": 1.0, "step": 325, "think_completion_length": 55.375 }, { "clip_ratio": 0.0, "completion_length": 129.09375, "epoch": 0.5497470489038786, "grad_norm": 72.79025141301403, "kl": 0.2392578125, "learning_rate": 8.900505902192243e-07, "loss": 0.0002, "reward": 3.7254384756088257, "reward_std": 0.11009544506669044, "rewards/final_reward": 1.8056284891474292, "rewards/mask_iou_reward": 0.9028142445737146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7254384756088257, "rewards/thk_ans_format_reward": 1.0, "step": 326, "think_completion_length": 65.09375 }, { "clip_ratio": 0.0, "completion_length": 130.90625, "epoch": 0.551433389544688, "grad_norm": 6.158065182698799, "kl": 0.259765625, "learning_rate": 8.897133220910623e-07, "loss": 0.0003, "reward": 2.8164632320404053, "reward_std": 0.2505396902561188, "rewards/final_reward": 0.820963424891961, "rewards/mask_iou_reward": 0.4104817124459805, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8164632171392441, "rewards/thk_ans_format_reward": 1.0, "step": 327, "think_completion_length": 60.78125 }, { "clip_ratio": 0.0, "completion_length": 131.71875, "epoch": 0.5531197301854974, "grad_norm": 6.008582279919581, "kl": 0.27001953125, "learning_rate": 8.893760539629005e-07, "loss": 0.0003, "reward": 2.9342983961105347, "reward_std": 0.48896993696689606, "rewards/final_reward": 0.9667943819377082, "rewards/mask_iou_reward": 0.4833971909688541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.934298574924469, "rewards/thk_ans_format_reward": 1.0, "step": 328, "think_completion_length": 67.21875 }, { "clip_ratio": 0.0, "completion_length": 122.359375, "epoch": 0.554806070826307, "grad_norm": 5.243788107627332, "kl": 0.27490234375, "learning_rate": 8.890387858347385e-07, "loss": 0.0003, "reward": 3.6061549186706543, "reward_std": 0.32689087092876434, "rewards/final_reward": 1.5299962392263828, "rewards/mask_iou_reward": 0.7649981196131914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6061549186706543, "rewards/thk_ans_format_reward": 1.0, "step": 329, "think_completion_length": 55.1875 }, { "clip_ratio": 0.0, "completion_length": 126.75, "epoch": 0.5564924114671164, "grad_norm": 4.358233922614152, "kl": 0.24072265625, "learning_rate": 8.887015177065766e-07, "loss": 0.0002, "reward": 3.3998658657073975, "reward_std": 0.2283829301595688, "rewards/final_reward": 1.1769258962723228, "rewards/mask_iou_reward": 0.5884629481361614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.399865746498108, "rewards/thk_ans_format_reward": 1.0, "step": 330, "think_completion_length": 61.9375 }, { "clip_ratio": 0.0, "completion_length": 126.953125, "epoch": 0.5581787521079258, "grad_norm": 5.040552564523054, "kl": 0.2470703125, "learning_rate": 8.883642495784148e-07, "loss": 0.0002, "reward": 2.9218242168426514, "reward_std": 0.3060344159603119, "rewards/final_reward": 1.0111649972253083, "rewards/mask_iou_reward": 0.5055824986126541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9218244031071663, "rewards/thk_ans_format_reward": 1.0, "step": 331, "think_completion_length": 53.40625 }, { "clip_ratio": 0.0, "completion_length": 121.921875, "epoch": 0.5598650927487352, "grad_norm": 13.687986163382138, "kl": 0.25439453125, "learning_rate": 8.880269814502529e-07, "loss": 0.0003, "reward": 3.163904905319214, "reward_std": 0.24482608400285244, "rewards/final_reward": 1.5012755393722892, "rewards/mask_iou_reward": 0.7506377696861446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.179529845714569, "rewards/thk_ans_format_reward": 0.984375, "step": 332, "think_completion_length": 51.0625 }, { "clip_ratio": 0.0, "completion_length": 137.5, "epoch": 0.5615514333895447, "grad_norm": 4.203445201103453, "kl": 0.25048828125, "learning_rate": 8.87689713322091e-07, "loss": 0.0003, "reward": 2.74581241607666, "reward_std": 0.26485906541347504, "rewards/final_reward": 1.1390789726854125, "rewards/mask_iou_reward": 0.5695394863427062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7458123862743378, "rewards/thk_ans_format_reward": 1.0, "step": 333, "think_completion_length": 54.375 }, { "clip_ratio": 0.0, "completion_length": 124.796875, "epoch": 0.5632377740303541, "grad_norm": 4.155544073994526, "kl": 0.2412109375, "learning_rate": 8.873524451939292e-07, "loss": 0.0002, "reward": 2.7413270473480225, "reward_std": 0.19857023283839226, "rewards/final_reward": 0.8799631358931382, "rewards/mask_iou_reward": 0.4399815679465691, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7413269579410553, "rewards/thk_ans_format_reward": 1.0, "step": 334, "think_completion_length": 62.71875 }, { "clip_ratio": 0.0, "completion_length": 120.15625, "epoch": 0.5649241146711635, "grad_norm": 4.1431683610726875, "kl": 0.2841796875, "learning_rate": 8.870151770657673e-07, "loss": 0.0003, "reward": 3.1733168363571167, "reward_std": 0.3433510363101959, "rewards/final_reward": 0.6148122126354274, "rewards/mask_iou_reward": 0.3074061063177137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.173316776752472, "rewards/thk_ans_format_reward": 1.0, "step": 335, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 136.703125, "epoch": 0.5666104553119731, "grad_norm": 5.87787727228753, "kl": 0.26904296875, "learning_rate": 8.866779089376053e-07, "loss": 0.0003, "reward": 2.931514024734497, "reward_std": 0.17161303758621216, "rewards/final_reward": 1.1005590800769713, "rewards/mask_iou_reward": 0.5502795400384857, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9315140843391418, "rewards/thk_ans_format_reward": 1.0, "step": 336, "think_completion_length": 55.6875 }, { "clip_ratio": 0.0, "completion_length": 128.046875, "epoch": 0.5682967959527825, "grad_norm": 4.990989117785591, "kl": 0.251953125, "learning_rate": 8.863406408094435e-07, "loss": 0.0002, "reward": 3.413249135017395, "reward_std": 0.1451067440211773, "rewards/final_reward": 1.8223518536043777, "rewards/mask_iou_reward": 0.9111759268021888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4132491946220398, "rewards/thk_ans_format_reward": 1.0, "step": 337, "think_completion_length": 55.4375 }, { "clip_ratio": 0.0, "completion_length": 127.4375, "epoch": 0.5699831365935919, "grad_norm": 27.3955634919466, "kl": 0.369140625, "learning_rate": 8.860033726812815e-07, "loss": 0.0004, "reward": 3.265120506286621, "reward_std": 0.2976267971098423, "rewards/final_reward": 1.0012632907001968, "rewards/mask_iou_reward": 0.5006316453500984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2807455658912659, "rewards/thk_ans_format_reward": 0.984375, "step": 338, "think_completion_length": 58.6875 }, { "clip_ratio": 0.0, "completion_length": 121.0, "epoch": 0.5716694772344013, "grad_norm": 5.072182504253793, "kl": 0.263671875, "learning_rate": 8.856661045531197e-07, "loss": 0.0003, "reward": 3.1486185789108276, "reward_std": 0.3894375190138817, "rewards/final_reward": 1.375238376644695, "rewards/mask_iou_reward": 0.6876191883223475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1486185491085052, "rewards/thk_ans_format_reward": 1.0, "step": 339, "think_completion_length": 51.78125 }, { "clip_ratio": 0.0, "completion_length": 127.796875, "epoch": 0.5733558178752108, "grad_norm": 4.334373316071912, "kl": 0.263671875, "learning_rate": 8.853288364249578e-07, "loss": 0.0003, "reward": 3.1587361097335815, "reward_std": 0.3358212560415268, "rewards/final_reward": 1.353470388714372, "rewards/mask_iou_reward": 0.676735194357186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1587361097335815, "rewards/thk_ans_format_reward": 1.0, "step": 340, "think_completion_length": 60.71875 }, { "clip_ratio": 0.0, "completion_length": 126.046875, "epoch": 0.5750421585160203, "grad_norm": 5.806740334340846, "kl": 0.2587890625, "learning_rate": 8.849915682967959e-07, "loss": 0.0003, "reward": 3.818936824798584, "reward_std": 0.04810533579438925, "rewards/final_reward": 1.8369189113220132, "rewards/mask_iou_reward": 0.9184594556610066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.818936824798584, "rewards/thk_ans_format_reward": 1.0, "step": 341, "think_completion_length": 66.15625 }, { "clip_ratio": 0.0, "completion_length": 151.71875, "epoch": 0.5767284991568297, "grad_norm": 6.058374485035506, "kl": 0.4013671875, "learning_rate": 8.846543001686341e-07, "loss": 0.0004, "reward": 3.5225307941436768, "reward_std": 0.37527384608983994, "rewards/final_reward": 1.4858986672894283, "rewards/mask_iou_reward": 0.7429493336447142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.522530734539032, "rewards/thk_ans_format_reward": 1.0, "step": 342, "think_completion_length": 61.5625 }, { "clip_ratio": 0.0, "completion_length": 124.640625, "epoch": 0.5784148397976391, "grad_norm": 3.3695996161749124, "kl": 0.23681640625, "learning_rate": 8.843170320404722e-07, "loss": 0.0002, "reward": 3.1007397174835205, "reward_std": 0.12492630630731583, "rewards/final_reward": 1.1520376838351214, "rewards/mask_iou_reward": 0.5760188419175607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.100739747285843, "rewards/thk_ans_format_reward": 1.0, "step": 343, "think_completion_length": 54.15625 }, { "clip_ratio": 0.0, "completion_length": 119.203125, "epoch": 0.5801011804384486, "grad_norm": 4.939004747976328, "kl": 0.28857421875, "learning_rate": 8.839797639123102e-07, "loss": 0.0003, "reward": 3.2634146213531494, "reward_std": 0.3214031755924225, "rewards/final_reward": 0.9727509594520076, "rewards/mask_iou_reward": 0.4863754797260038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2634146213531494, "rewards/thk_ans_format_reward": 1.0, "step": 344, "think_completion_length": 50.15625 }, { "clip_ratio": 0.0, "completion_length": 158.546875, "epoch": 0.581787521079258, "grad_norm": 7.867013326293164, "kl": 0.25634765625, "learning_rate": 8.836424957841484e-07, "loss": 0.0003, "reward": 2.949946165084839, "reward_std": 0.45425738394260406, "rewards/final_reward": 0.9740685863298226, "rewards/mask_iou_reward": 0.4870342931649113, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9655711352825165, "rewards/thk_ans_format_reward": 1.0, "step": 345, "think_completion_length": 60.90625 }, { "clip_ratio": 0.0, "completion_length": 138.875, "epoch": 0.5834738617200674, "grad_norm": 3.5260809492982257, "kl": 0.3115234375, "learning_rate": 8.833052276559864e-07, "loss": 0.0003, "reward": 2.9657833576202393, "reward_std": 0.3081818874925375, "rewards/final_reward": 1.1107575452706013, "rewards/mask_iou_reward": 0.5553787726353007, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9814085066318512, "rewards/thk_ans_format_reward": 1.0, "step": 346, "think_completion_length": 62.03125 }, { "clip_ratio": 0.0, "completion_length": 127.4375, "epoch": 0.5851602023608768, "grad_norm": 5.035821733992209, "kl": 0.25048828125, "learning_rate": 8.829679595278245e-07, "loss": 0.0003, "reward": 2.996280312538147, "reward_std": 0.13460349664092064, "rewards/final_reward": 1.308326504855776, "rewards/mask_iou_reward": 0.654163252427888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9962801933288574, "rewards/thk_ans_format_reward": 1.0, "step": 347, "think_completion_length": 63.875 }, { "clip_ratio": 0.0, "completion_length": 130.78125, "epoch": 0.5868465430016864, "grad_norm": 27.576882428955795, "kl": 0.2685546875, "learning_rate": 8.826306913996627e-07, "loss": 0.0003, "reward": 2.9972203969955444, "reward_std": 0.2722517102956772, "rewards/final_reward": 0.8487660003192941, "rewards/mask_iou_reward": 0.42438300015964703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9972204267978668, "rewards/thk_ans_format_reward": 1.0, "step": 348, "think_completion_length": 68.21875 }, { "clip_ratio": 0.0, "completion_length": 134.65625, "epoch": 0.5885328836424958, "grad_norm": 4.389319652797556, "kl": 0.24609375, "learning_rate": 8.822934232715008e-07, "loss": 0.0002, "reward": 2.865875244140625, "reward_std": 0.13240397721529007, "rewards/final_reward": 0.9376044149712613, "rewards/mask_iou_reward": 0.4688022074856307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8658752739429474, "rewards/thk_ans_format_reward": 1.0, "step": 349, "think_completion_length": 59.96875 }, { "clip_ratio": 0.0, "completion_length": 129.625, "epoch": 0.5902192242833052, "grad_norm": 5.101680634393306, "kl": 0.43115234375, "learning_rate": 8.819561551433389e-07, "loss": 0.0004, "reward": 3.332337260246277, "reward_std": 0.3843718320131302, "rewards/final_reward": 1.221562650525076, "rewards/mask_iou_reward": 0.610781325262538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3323372602462769, "rewards/thk_ans_format_reward": 1.0, "step": 350, "think_completion_length": 56.1875 }, { "clip_ratio": 0.0, "completion_length": 132.765625, "epoch": 0.5919055649241147, "grad_norm": 6.998265450002808, "kl": 0.25390625, "learning_rate": 8.816188870151771e-07, "loss": 0.0003, "reward": 2.870605945587158, "reward_std": 0.2439437434077263, "rewards/final_reward": 0.754005258460424, "rewards/mask_iou_reward": 0.377002629230212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8706059455871582, "rewards/thk_ans_format_reward": 1.0, "step": 351, "think_completion_length": 57.5625 }, { "clip_ratio": 0.0, "completion_length": 143.46875, "epoch": 0.5935919055649241, "grad_norm": 6.202091538730412, "kl": 0.251953125, "learning_rate": 8.812816188870151e-07, "loss": 0.0003, "reward": 3.2305729389190674, "reward_std": 0.4073144942522049, "rewards/final_reward": 1.0945567016798163, "rewards/mask_iou_reward": 0.5472783508399082, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.2774479985237122, "rewards/thk_ans_format_reward": 1.0, "step": 352, "think_completion_length": 63.65625 }, { "clip_ratio": 0.0, "completion_length": 125.5, "epoch": 0.5952782462057336, "grad_norm": 5.579559586465592, "kl": 0.2841796875, "learning_rate": 8.809443507588532e-07, "loss": 0.0003, "reward": 3.342374801635742, "reward_std": 0.39917896687984467, "rewards/final_reward": 1.1869944150046479, "rewards/mask_iou_reward": 0.5934972075023239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3423748016357422, "rewards/thk_ans_format_reward": 1.0, "step": 353, "think_completion_length": 60.28125 }, { "clip_ratio": 0.0, "completion_length": 130.421875, "epoch": 0.596964586846543, "grad_norm": 5.020379721979836, "kl": 0.275390625, "learning_rate": 8.806070826306914e-07, "loss": 0.0003, "reward": 3.1944527626037598, "reward_std": 0.2604561969637871, "rewards/final_reward": 1.854011010006161, "rewards/mask_iou_reward": 0.9270055050030805, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.194452702999115, "rewards/thk_ans_format_reward": 1.0, "step": 354, "think_completion_length": 60.46875 }, { "clip_ratio": 0.0, "completion_length": 163.890625, "epoch": 0.5986509274873525, "grad_norm": 13.02489612056237, "kl": 0.29736328125, "learning_rate": 8.802698145025294e-07, "loss": 0.0003, "reward": 2.82450008392334, "reward_std": 0.2679259032011032, "rewards/final_reward": 0.8881105820850812, "rewards/mask_iou_reward": 0.4440552910425406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8245000541210175, "rewards/thk_ans_format_reward": 1.0, "step": 355, "think_completion_length": 69.21875 }, { "clip_ratio": 0.0, "completion_length": 140.59375, "epoch": 0.6003372681281619, "grad_norm": 23.063711576098893, "kl": 0.232421875, "learning_rate": 8.799325463743675e-07, "loss": 0.0002, "reward": 2.9744956493377686, "reward_std": 0.2751796506345272, "rewards/final_reward": 1.4443419340676278, "rewards/mask_iou_reward": 0.7221709670338139, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9901207089424133, "rewards/thk_ans_format_reward": 1.0, "step": 356, "think_completion_length": 64.75 }, { "clip_ratio": 0.0, "completion_length": 139.953125, "epoch": 0.6020236087689713, "grad_norm": 8.104848340003967, "kl": 0.2822265625, "learning_rate": 8.795952782462057e-07, "loss": 0.0003, "reward": 2.4996743202209473, "reward_std": 0.39900892972946167, "rewards/final_reward": 0.0625, "rewards/mask_iou_reward": 0.03125, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.5465493500232697, "rewards/thk_ans_format_reward": 0.984375, "step": 357, "think_completion_length": 56.625 }, { "clip_ratio": 0.0, "completion_length": 130.0625, "epoch": 0.6037099494097807, "grad_norm": 4.853883488792399, "kl": 0.2734375, "learning_rate": 8.792580101180438e-07, "loss": 0.0003, "reward": 3.561056137084961, "reward_std": 0.40073344111442566, "rewards/final_reward": 1.6063573685327528, "rewards/mask_iou_reward": 0.8031786842663764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5610561966896057, "rewards/thk_ans_format_reward": 1.0, "step": 358, "think_completion_length": 56.125 }, { "clip_ratio": 0.0, "completion_length": 142.859375, "epoch": 0.6053962900505903, "grad_norm": 5.976143731168656, "kl": 0.2568359375, "learning_rate": 8.789207419898819e-07, "loss": 0.0003, "reward": 3.239910840988159, "reward_std": 0.3226969689130783, "rewards/final_reward": 1.1236291825562077, "rewards/mask_iou_reward": 0.5618145912781038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2399107217788696, "rewards/thk_ans_format_reward": 1.0, "step": 359, "think_completion_length": 68.84375 }, { "clip_ratio": 0.0, "completion_length": 126.578125, "epoch": 0.6070826306913997, "grad_norm": 5.128737900496672, "kl": 0.263671875, "learning_rate": 8.785834738617201e-07, "loss": 0.0003, "reward": 2.7771421670913696, "reward_std": 0.5488343238830566, "rewards/final_reward": 0.6804320378228413, "rewards/mask_iou_reward": 0.34021601891142067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7771420478820801, "rewards/thk_ans_format_reward": 1.0, "step": 360, "think_completion_length": 61.65625 }, { "clip_ratio": 0.0, "completion_length": 132.125, "epoch": 0.6087689713322091, "grad_norm": 4.894849899866179, "kl": 0.2734375, "learning_rate": 8.782462057335581e-07, "loss": 0.0003, "reward": 3.0549187660217285, "reward_std": 0.17568432539701462, "rewards/final_reward": 1.432484962181782, "rewards/mask_iou_reward": 0.716242481090891, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0549187511205673, "rewards/thk_ans_format_reward": 1.0, "step": 361, "think_completion_length": 59.03125 }, { "clip_ratio": 0.0, "completion_length": 151.875, "epoch": 0.6104553119730185, "grad_norm": 5.639323127623386, "kl": 0.3330078125, "learning_rate": 8.779089376053963e-07, "loss": 0.0003, "reward": 3.342157244682312, "reward_std": 0.3479279577732086, "rewards/final_reward": 1.4514362461598367, "rewards/mask_iou_reward": 0.7257181230799183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3421571850776672, "rewards/thk_ans_format_reward": 1.0, "step": 362, "think_completion_length": 67.8125 }, { "clip_ratio": 0.0, "completion_length": 128.59375, "epoch": 0.612141652613828, "grad_norm": 9.40098674825858, "kl": 0.3349609375, "learning_rate": 8.775716694772344e-07, "loss": 0.0003, "reward": 3.223899245262146, "reward_std": 0.177157923579216, "rewards/final_reward": 1.1118048766189446, "rewards/mask_iou_reward": 0.5559024383094723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2238992750644684, "rewards/thk_ans_format_reward": 1.0, "step": 363, "think_completion_length": 64.78125 }, { "clip_ratio": 0.0, "completion_length": 167.15625, "epoch": 0.6138279932546374, "grad_norm": 59.3341930780702, "kl": 0.22998046875, "learning_rate": 8.772344013490724e-07, "loss": 0.0002, "reward": 3.087131977081299, "reward_std": 0.4654431641101837, "rewards/final_reward": 1.21102136613791, "rewards/mask_iou_reward": 0.605510683068955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.087131917476654, "rewards/thk_ans_format_reward": 1.0, "step": 364, "think_completion_length": 64.25 }, { "clip_ratio": 0.0, "completion_length": 128.640625, "epoch": 0.6155143338954469, "grad_norm": 5.181103170179545, "kl": 0.3125, "learning_rate": 8.768971332209106e-07, "loss": 0.0003, "reward": 3.7247079610824585, "reward_std": 0.22579550743103027, "rewards/final_reward": 1.766907703986788, "rewards/mask_iou_reward": 0.883453851993394, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.7403329610824585, "rewards/thk_ans_format_reward": 1.0, "step": 365, "think_completion_length": 61.625 }, { "clip_ratio": 0.0, "completion_length": 131.9375, "epoch": 0.6172006745362564, "grad_norm": 7.005506467477656, "kl": 0.2734375, "learning_rate": 8.765598650927487e-07, "loss": 0.0003, "reward": 3.311755895614624, "reward_std": 0.09347007237374783, "rewards/final_reward": 0.9238924283518867, "rewards/mask_iou_reward": 0.46194621417594334, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3117559254169464, "rewards/thk_ans_format_reward": 1.0, "step": 366, "think_completion_length": 64.09375 }, { "clip_ratio": 0.0, "completion_length": 135.109375, "epoch": 0.6188870151770658, "grad_norm": 26.431305097403335, "kl": 0.2783203125, "learning_rate": 8.762225969645868e-07, "loss": 0.0003, "reward": 3.2874670028686523, "reward_std": 0.3315463215112686, "rewards/final_reward": 0.8942609141714675, "rewards/mask_iou_reward": 0.44713045708573373, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2874669432640076, "rewards/thk_ans_format_reward": 1.0, "step": 367, "think_completion_length": 73.25 }, { "clip_ratio": 0.0, "completion_length": 124.328125, "epoch": 0.6205733558178752, "grad_norm": 22.45281866642402, "kl": 0.384765625, "learning_rate": 8.75885328836425e-07, "loss": 0.0004, "reward": 3.3114068508148193, "reward_std": 0.14964250102639198, "rewards/final_reward": 1.2569832236958134, "rewards/mask_iou_reward": 0.6284916118479067, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3114069104194641, "rewards/thk_ans_format_reward": 1.0, "step": 368, "think_completion_length": 57.78125 }, { "clip_ratio": 0.0, "completion_length": 141.578125, "epoch": 0.6222596964586846, "grad_norm": 5.267594297758696, "kl": 0.287109375, "learning_rate": 8.75548060708263e-07, "loss": 0.0003, "reward": 3.296778082847595, "reward_std": 0.28996123373508453, "rewards/final_reward": 1.7162430048361945, "rewards/mask_iou_reward": 0.8581215024180973, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2967780828475952, "rewards/thk_ans_format_reward": 1.0, "step": 369, "think_completion_length": 68.75 }, { "clip_ratio": 0.0, "completion_length": 132.734375, "epoch": 0.6239460370994941, "grad_norm": 32.26698855157922, "kl": 0.25244140625, "learning_rate": 8.752107925801011e-07, "loss": 0.0002, "reward": 3.587504506111145, "reward_std": 0.20235136337578297, "rewards/final_reward": 1.6433316468966788, "rewards/mask_iou_reward": 0.8216658234483394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.587504506111145, "rewards/thk_ans_format_reward": 1.0, "step": 370, "think_completion_length": 56.03125 }, { "clip_ratio": 0.0, "completion_length": 133.453125, "epoch": 0.6256323777403036, "grad_norm": 36.90675958939858, "kl": 0.25830078125, "learning_rate": 8.748735244519393e-07, "loss": 0.0003, "reward": 3.4644440412521362, "reward_std": 0.14919602870941162, "rewards/final_reward": 1.8359068655427828, "rewards/mask_iou_reward": 0.9179534327713914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4644440412521362, "rewards/thk_ans_format_reward": 1.0, "step": 371, "think_completion_length": 63.3125 }, { "clip_ratio": 0.0, "completion_length": 131.5625, "epoch": 0.627318718381113, "grad_norm": 3.977481472662859, "kl": 0.2880859375, "learning_rate": 8.745362563237774e-07, "loss": 0.0003, "reward": 3.485430121421814, "reward_std": 0.08592750132083893, "rewards/final_reward": 1.8542197972391161, "rewards/mask_iou_reward": 0.9271098986195581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.485430121421814, "rewards/thk_ans_format_reward": 1.0, "step": 372, "think_completion_length": 64.84375 }, { "clip_ratio": 0.0, "completion_length": 144.0, "epoch": 0.6290050590219224, "grad_norm": 11.672200901434106, "kl": 0.23095703125, "learning_rate": 8.741989881956154e-07, "loss": 0.0002, "reward": 3.1509275436401367, "reward_std": 0.3214876800775528, "rewards/final_reward": 1.6934902014125646, "rewards/mask_iou_reward": 0.8467451007062823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1509276628494263, "rewards/thk_ans_format_reward": 1.0, "step": 373, "think_completion_length": 69.3125 }, { "clip_ratio": 0.0, "completion_length": 131.75, "epoch": 0.6306913996627319, "grad_norm": 6.7196830138170345, "kl": 0.2705078125, "learning_rate": 8.738617200674536e-07, "loss": 0.0003, "reward": 3.345046043395996, "reward_std": 0.24192753434181213, "rewards/final_reward": 1.1307378696874646, "rewards/mask_iou_reward": 0.5653689348437323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3450458347797394, "rewards/thk_ans_format_reward": 1.0, "step": 374, "think_completion_length": 64.25 }, { "clip_ratio": 0.0, "completion_length": 172.90625, "epoch": 0.6323777403035413, "grad_norm": 6.715116261073944, "kl": 0.2353515625, "learning_rate": 8.735244519392917e-07, "loss": 0.0002, "reward": 2.55766224861145, "reward_std": 0.07771342247724533, "rewards/final_reward": 0.9295640305047778, "rewards/mask_iou_reward": 0.4647820152523889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5576622486114502, "rewards/thk_ans_format_reward": 1.0, "step": 375, "think_completion_length": 59.1875 }, { "clip_ratio": 0.0, "completion_length": 133.671875, "epoch": 0.6340640809443507, "grad_norm": 5.350210243493229, "kl": 0.25146484375, "learning_rate": 8.731871838111298e-07, "loss": 0.0003, "reward": 3.4477986097335815, "reward_std": 0.20711440779268742, "rewards/final_reward": 1.2917363954773742, "rewards/mask_iou_reward": 0.6458681977386871, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4477986097335815, "rewards/thk_ans_format_reward": 1.0, "step": 376, "think_completion_length": 59.25 }, { "clip_ratio": 0.0, "completion_length": 137.75, "epoch": 0.6357504215851602, "grad_norm": 13.855411935365716, "kl": 0.263671875, "learning_rate": 8.728499156829679e-07, "loss": 0.0003, "reward": 3.2779818773269653, "reward_std": 0.5463913530111313, "rewards/final_reward": 0.9221951530177666, "rewards/mask_iou_reward": 0.4610975765088833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2779819965362549, "rewards/thk_ans_format_reward": 1.0, "step": 377, "think_completion_length": 75.40625 }, { "clip_ratio": 0.0, "completion_length": 143.28125, "epoch": 0.6374367622259697, "grad_norm": 15.4353365340289, "kl": 0.28125, "learning_rate": 8.72512647554806e-07, "loss": 0.0003, "reward": 2.8729852437973022, "reward_std": 0.11946488171815872, "rewards/final_reward": 1.374176703995387, "rewards/mask_iou_reward": 0.6870883519976935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.872985303401947, "rewards/thk_ans_format_reward": 1.0, "step": 378, "think_completion_length": 60.21875 }, { "clip_ratio": 0.0, "completion_length": 128.5, "epoch": 0.6391231028667791, "grad_norm": 6.330254599055379, "kl": 0.2646484375, "learning_rate": 8.721753794266441e-07, "loss": 0.0003, "reward": 3.4926542043685913, "reward_std": 0.360406506806612, "rewards/final_reward": 1.8171903327357386, "rewards/mask_iou_reward": 0.9085951663678693, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4926542043685913, "rewards/thk_ans_format_reward": 1.0, "step": 379, "think_completion_length": 58.65625 }, { "clip_ratio": 0.0, "completion_length": 122.625, "epoch": 0.6408094435075885, "grad_norm": 8.281536520621378, "kl": 0.359375, "learning_rate": 8.718381112984823e-07, "loss": 0.0004, "reward": 2.885327696800232, "reward_std": 0.38757482171058655, "rewards/final_reward": 1.036380061901672, "rewards/mask_iou_reward": 0.518190030950836, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9009527862071991, "rewards/thk_ans_format_reward": 0.984375, "step": 380, "think_completion_length": 53.28125 }, { "clip_ratio": 0.0, "completion_length": 135.78125, "epoch": 0.642495784148398, "grad_norm": 4.035395909526506, "kl": 0.2958984375, "learning_rate": 8.715008431703204e-07, "loss": 0.0003, "reward": 2.560006260871887, "reward_std": 0.23123303055763245, "rewards/final_reward": 0.8636089485805896, "rewards/mask_iou_reward": 0.4318044742902948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.575631245970726, "rewards/thk_ans_format_reward": 0.984375, "step": 381, "think_completion_length": 71.5625 }, { "clip_ratio": 0.0, "completion_length": 136.953125, "epoch": 0.6441821247892074, "grad_norm": 10.783252150736887, "kl": 0.2900390625, "learning_rate": 8.711635750421584e-07, "loss": 0.0003, "reward": 3.132146716117859, "reward_std": 0.28874067962169647, "rewards/final_reward": 0.9155655076835651, "rewards/mask_iou_reward": 0.4577827538417826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1321466565132141, "rewards/thk_ans_format_reward": 1.0, "step": 382, "think_completion_length": 62.0625 }, { "clip_ratio": 0.0, "completion_length": 147.703125, "epoch": 0.6458684654300169, "grad_norm": 11.402565008509699, "kl": 0.263671875, "learning_rate": 8.708263069139966e-07, "loss": 0.0003, "reward": 3.3465912342071533, "reward_std": 0.4565277546644211, "rewards/final_reward": 1.5629399880976687, "rewards/mask_iou_reward": 0.7814699940488343, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3465911149978638, "rewards/thk_ans_format_reward": 1.0, "step": 383, "think_completion_length": 57.40625 }, { "clip_ratio": 0.0, "completion_length": 145.375, "epoch": 0.6475548060708263, "grad_norm": 4.795135202730111, "kl": 0.3251953125, "learning_rate": 8.704890387858347e-07, "loss": 0.0003, "reward": 3.0373746156692505, "reward_std": 0.33877818286418915, "rewards/final_reward": 1.425423519540372, "rewards/mask_iou_reward": 0.712711759770186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0373746454715729, "rewards/thk_ans_format_reward": 1.0, "step": 384, "think_completion_length": 56.125 }, { "clip_ratio": 0.0, "completion_length": 188.953125, "epoch": 0.6492411467116358, "grad_norm": 7.6054679003491, "kl": 0.2783203125, "learning_rate": 8.701517706576727e-07, "loss": 0.0003, "reward": 3.0235763788223267, "reward_std": 0.34590520709753036, "rewards/final_reward": 0.6524966898611744, "rewards/mask_iou_reward": 0.3262483449305872, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0235763490200043, "rewards/thk_ans_format_reward": 1.0, "step": 385, "think_completion_length": 61.65625 }, { "clip_ratio": 0.0, "completion_length": 143.375, "epoch": 0.6509274873524452, "grad_norm": 7.642619901632982, "kl": 0.26171875, "learning_rate": 8.698145025295109e-07, "loss": 0.0003, "reward": 3.2443251609802246, "reward_std": 0.15900463983416557, "rewards/final_reward": 1.1955481569676252, "rewards/mask_iou_reward": 0.5977740784838126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.244325041770935, "rewards/thk_ans_format_reward": 1.0, "step": 386, "think_completion_length": 63.5625 }, { "clip_ratio": 0.0, "completion_length": 140.359375, "epoch": 0.6526138279932546, "grad_norm": 5.098340106768318, "kl": 0.3203125, "learning_rate": 8.69477234401349e-07, "loss": 0.0006, "reward": 2.9133780002593994, "reward_std": 0.10066283494234085, "rewards/final_reward": 0.13628261847652232, "rewards/mask_iou_reward": 0.06814130923826116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9133780598640442, "rewards/thk_ans_format_reward": 1.0, "step": 387, "think_completion_length": 59.375 }, { "clip_ratio": 0.0, "completion_length": 133.25, "epoch": 0.654300168634064, "grad_norm": 4.621168421511462, "kl": 0.38671875, "learning_rate": 8.691399662731872e-07, "loss": 0.0004, "reward": 3.448104739189148, "reward_std": 0.46191219985485077, "rewards/final_reward": 1.5218526491931632, "rewards/mask_iou_reward": 0.7609263245965816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4481046795845032, "rewards/thk_ans_format_reward": 1.0, "step": 388, "think_completion_length": 56.6875 }, { "clip_ratio": 0.0, "completion_length": 146.359375, "epoch": 0.6559865092748736, "grad_norm": 5.379842188826864, "kl": 0.421875, "learning_rate": 8.688026981450253e-07, "loss": 0.0004, "reward": 3.3314003944396973, "reward_std": 0.1324574500322342, "rewards/final_reward": 1.4115282221098315, "rewards/mask_iou_reward": 0.7057641110549158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3314002752304077, "rewards/thk_ans_format_reward": 1.0, "step": 389, "think_completion_length": 65.0625 }, { "clip_ratio": 0.0, "completion_length": 149.984375, "epoch": 0.657672849915683, "grad_norm": 8.721876673645482, "kl": 0.3671875, "learning_rate": 8.684654300168634e-07, "loss": 0.0004, "reward": 3.0254807472229004, "reward_std": 0.3890673518180847, "rewards/final_reward": 0.6676522325720965, "rewards/mask_iou_reward": 0.33382611628604825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0254807472229004, "rewards/thk_ans_format_reward": 1.0, "step": 390, "think_completion_length": 72.40625 }, { "clip_ratio": 0.0, "completion_length": 129.40625, "epoch": 0.6593591905564924, "grad_norm": 6.60492896849073, "kl": 0.2958984375, "learning_rate": 8.681281618887015e-07, "loss": 0.0003, "reward": 3.555801510810852, "reward_std": 0.3658871501684189, "rewards/final_reward": 1.5090375076129487, "rewards/mask_iou_reward": 0.7545187538064744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5558015704154968, "rewards/thk_ans_format_reward": 1.0, "step": 391, "think_completion_length": 56.96875 }, { "clip_ratio": 0.0, "completion_length": 148.28125, "epoch": 0.6610455311973018, "grad_norm": 11.714815873295858, "kl": 0.271484375, "learning_rate": 8.677908937605396e-07, "loss": 0.0003, "reward": 2.6682028770446777, "reward_std": 0.12304145842790604, "rewards/final_reward": 0.7471882303933259, "rewards/mask_iou_reward": 0.37359411519666297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.668202817440033, "rewards/thk_ans_format_reward": 1.0, "step": 392, "think_completion_length": 71.03125 }, { "clip_ratio": 0.0, "completion_length": 144.375, "epoch": 0.6627318718381113, "grad_norm": 6.683431729410963, "kl": 0.2978515625, "learning_rate": 8.674536256323777e-07, "loss": 0.0003, "reward": 3.397181987762451, "reward_std": 0.27734769880771637, "rewards/final_reward": 1.081036873280966, "rewards/mask_iou_reward": 0.540518436640483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3971819579601288, "rewards/thk_ans_format_reward": 1.0, "step": 393, "think_completion_length": 66.59375 }, { "clip_ratio": 0.0, "completion_length": 152.109375, "epoch": 0.6644182124789207, "grad_norm": 12.546590991266813, "kl": 0.296875, "learning_rate": 8.671163575042158e-07, "loss": 0.0003, "reward": 2.7042452096939087, "reward_std": 0.265919953584671, "rewards/final_reward": 0.6445974072533363, "rewards/mask_iou_reward": 0.3222987036266681, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7042451202869415, "rewards/thk_ans_format_reward": 1.0, "step": 394, "think_completion_length": 74.375 }, { "clip_ratio": 0.0, "completion_length": 163.515625, "epoch": 0.6661045531197302, "grad_norm": 9.094193873759082, "kl": 0.23046875, "learning_rate": 8.667790893760539e-07, "loss": 0.0002, "reward": 3.18087375164032, "reward_std": 0.10219046473503113, "rewards/final_reward": 1.65937009926742, "rewards/mask_iou_reward": 0.82968504963371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1808739304542542, "rewards/thk_ans_format_reward": 1.0, "step": 395, "think_completion_length": 65.25 }, { "clip_ratio": 0.0, "completion_length": 133.03125, "epoch": 0.6677908937605397, "grad_norm": 8.923928936464332, "kl": 0.306640625, "learning_rate": 8.66441821247892e-07, "loss": 0.0003, "reward": 3.333390235900879, "reward_std": 0.10420708172023296, "rewards/final_reward": 1.1318658920698947, "rewards/mask_iou_reward": 0.5659329460349474, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3333901762962341, "rewards/thk_ans_format_reward": 1.0, "step": 396, "think_completion_length": 63.875 }, { "clip_ratio": 0.0, "completion_length": 162.015625, "epoch": 0.6694772344013491, "grad_norm": 10.855733134185414, "kl": 0.28466796875, "learning_rate": 8.661045531197302e-07, "loss": 0.0003, "reward": 2.7989786863327026, "reward_std": 0.48467782139778137, "rewards/final_reward": 0.7798602489754165, "rewards/mask_iou_reward": 0.38993012448770825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7989786863327026, "rewards/thk_ans_format_reward": 1.0, "step": 397, "think_completion_length": 72.03125 }, { "clip_ratio": 0.0, "completion_length": 152.703125, "epoch": 0.6711635750421585, "grad_norm": 8.498372368415978, "kl": 0.2392578125, "learning_rate": 8.657672849915683e-07, "loss": 0.0002, "reward": 3.2104252576828003, "reward_std": 0.3338836580514908, "rewards/final_reward": 1.384165478226141, "rewards/mask_iou_reward": 0.6920827391130705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2104252576828003, "rewards/thk_ans_format_reward": 1.0, "step": 398, "think_completion_length": 55.40625 }, { "clip_ratio": 0.0, "completion_length": 138.15625, "epoch": 0.6728499156829679, "grad_norm": 4.168354530926599, "kl": 0.3369140625, "learning_rate": 8.654300168634064e-07, "loss": 0.0003, "reward": 3.360267162322998, "reward_std": 0.27631398290395737, "rewards/final_reward": 1.3226066492302255, "rewards/mask_iou_reward": 0.6613033246151128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.360267162322998, "rewards/thk_ans_format_reward": 1.0, "step": 399, "think_completion_length": 73.28125 }, { "clip_ratio": 0.0, "completion_length": 154.71875, "epoch": 0.6745362563237775, "grad_norm": 6.109202011063787, "kl": 0.2451171875, "learning_rate": 8.650927487352445e-07, "loss": 0.0002, "reward": 2.7300503253936768, "reward_std": 0.3722390979528427, "rewards/final_reward": 1.022374351321346, "rewards/mask_iou_reward": 0.511187175660673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7300503700971603, "rewards/thk_ans_format_reward": 1.0, "step": 400, "think_completion_length": 70.0625 }, { "clip_ratio": 0.0, "completion_length": 142.3125, "epoch": 0.6762225969645869, "grad_norm": 4.023558836956739, "kl": 0.2392578125, "learning_rate": 8.647554806070826e-07, "loss": 0.0002, "reward": 2.928010106086731, "reward_std": 0.26283153891563416, "rewards/final_reward": 1.1748140838127776, "rewards/mask_iou_reward": 0.5874070419063888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.928010106086731, "rewards/thk_ans_format_reward": 1.0, "step": 401, "think_completion_length": 65.5625 }, { "clip_ratio": 0.0, "completion_length": 132.984375, "epoch": 0.6779089376053963, "grad_norm": 3.6277340907732945, "kl": 0.2958984375, "learning_rate": 8.644182124789206e-07, "loss": 0.0003, "reward": 2.82840895652771, "reward_std": 0.18483292683959007, "rewards/final_reward": 0.09071974080914268, "rewards/mask_iou_reward": 0.04535987040457134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8284089267253876, "rewards/thk_ans_format_reward": 1.0, "step": 402, "think_completion_length": 66.5 }, { "clip_ratio": 0.0, "completion_length": 162.34375, "epoch": 0.6795952782462057, "grad_norm": 5.293796217517415, "kl": 0.26904296875, "learning_rate": 8.640809443507588e-07, "loss": 0.0003, "reward": 3.4878629446029663, "reward_std": 0.1448333915323019, "rewards/final_reward": 1.632881523088085, "rewards/mask_iou_reward": 0.8164407615440425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4878630638122559, "rewards/thk_ans_format_reward": 1.0, "step": 403, "think_completion_length": 58.71875 }, { "clip_ratio": 0.0, "completion_length": 128.90625, "epoch": 0.6812816188870152, "grad_norm": 4.0214023431676615, "kl": 0.298828125, "learning_rate": 8.637436762225969e-07, "loss": 0.0003, "reward": 3.0970970392227173, "reward_std": 0.24998241756111383, "rewards/final_reward": 1.3948014535785171, "rewards/mask_iou_reward": 0.6974007267892586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.097097098827362, "rewards/thk_ans_format_reward": 1.0, "step": 404, "think_completion_length": 64.4375 }, { "clip_ratio": 0.0, "completion_length": 140.703125, "epoch": 0.6829679595278246, "grad_norm": 6.991066868890862, "kl": 0.259765625, "learning_rate": 8.63406408094435e-07, "loss": 0.0003, "reward": 2.830276131629944, "reward_std": 0.557792603969574, "rewards/final_reward": 0.96004537767179, "rewards/mask_iou_reward": 0.480022688835895, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8302761018276215, "rewards/thk_ans_format_reward": 1.0, "step": 405, "think_completion_length": 64.0 }, { "clip_ratio": 0.0, "completion_length": 143.9375, "epoch": 0.684654300168634, "grad_norm": 14.478623985464413, "kl": 0.2919921875, "learning_rate": 8.630691399662732e-07, "loss": 0.0003, "reward": 2.990003228187561, "reward_std": 0.2519157975912094, "rewards/final_reward": 0.8406945511209645, "rewards/mask_iou_reward": 0.42034727556048224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.990003228187561, "rewards/thk_ans_format_reward": 1.0, "step": 406, "think_completion_length": 59.5 }, { "clip_ratio": 0.0, "completion_length": 162.234375, "epoch": 0.6863406408094435, "grad_norm": 4.662416823922717, "kl": 0.431640625, "learning_rate": 8.627318718381113e-07, "loss": 0.0004, "reward": 2.8567984104156494, "reward_std": 0.23610374331474304, "rewards/final_reward": 0.6864285578802869, "rewards/mask_iou_reward": 0.34321427894014345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.856798380613327, "rewards/thk_ans_format_reward": 1.0, "step": 407, "think_completion_length": 67.375 }, { "clip_ratio": 0.0, "completion_length": 136.78125, "epoch": 0.688026981450253, "grad_norm": 3.5185540402928988, "kl": 0.314453125, "learning_rate": 8.623946037099494e-07, "loss": 0.0003, "reward": 3.2333940267562866, "reward_std": 0.27213188260793686, "rewards/final_reward": 1.3256075781426653, "rewards/mask_iou_reward": 0.6628037890713326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2333939671516418, "rewards/thk_ans_format_reward": 1.0, "step": 408, "think_completion_length": 65.84375 }, { "clip_ratio": 0.0, "completion_length": 146.921875, "epoch": 0.6897133220910624, "grad_norm": 58.126162154558536, "kl": 0.2548828125, "learning_rate": 8.620573355817875e-07, "loss": 0.0002, "reward": 3.4215975999832153, "reward_std": 0.2836841717362404, "rewards/final_reward": 1.3137488698038982, "rewards/mask_iou_reward": 0.6568744349019491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4215975999832153, "rewards/thk_ans_format_reward": 1.0, "step": 409, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 151.390625, "epoch": 0.6913996627318718, "grad_norm": 4.944433841842423, "kl": 0.251953125, "learning_rate": 8.617200674536255e-07, "loss": 0.0003, "reward": 3.0755574703216553, "reward_std": 0.3461499884724617, "rewards/final_reward": 1.0501447498455052, "rewards/mask_iou_reward": 0.5250723749227526, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0755574703216553, "rewards/thk_ans_format_reward": 1.0, "step": 410, "think_completion_length": 62.15625 }, { "clip_ratio": 0.0, "completion_length": 129.109375, "epoch": 0.6930860033726813, "grad_norm": 22.612041850982212, "kl": 0.28125, "learning_rate": 8.613827993254636e-07, "loss": 0.0003, "reward": 3.604109525680542, "reward_std": 0.19016021490097046, "rewards/final_reward": 1.7495145669154235, "rewards/mask_iou_reward": 0.8747572834577118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6041094660758972, "rewards/thk_ans_format_reward": 1.0, "step": 411, "think_completion_length": 62.875 }, { "clip_ratio": 0.0, "completion_length": 129.359375, "epoch": 0.6947723440134908, "grad_norm": 7.973198624210423, "kl": 0.24951171875, "learning_rate": 8.610455311973018e-07, "loss": 0.0002, "reward": 3.139096975326538, "reward_std": 0.1365029662847519, "rewards/final_reward": 0.7252312308037943, "rewards/mask_iou_reward": 0.36261561540189713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1390970349311829, "rewards/thk_ans_format_reward": 1.0, "step": 412, "think_completion_length": 58.71875 }, { "clip_ratio": 0.0, "completion_length": 147.875, "epoch": 0.6964586846543002, "grad_norm": 9.023858103950877, "kl": 0.2607421875, "learning_rate": 8.607082630691399e-07, "loss": 0.0003, "reward": 3.077267289161682, "reward_std": 0.08271101489663124, "rewards/final_reward": 1.5952865266384522, "rewards/mask_iou_reward": 0.7976432633192261, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0772673785686493, "rewards/thk_ans_format_reward": 1.0, "step": 413, "think_completion_length": 65.0625 }, { "clip_ratio": 0.0, "completion_length": 125.71875, "epoch": 0.6981450252951096, "grad_norm": 5.158677504548988, "kl": 0.2666015625, "learning_rate": 8.603709949409781e-07, "loss": 0.0003, "reward": 2.795508623123169, "reward_std": 0.0809487490914762, "rewards/final_reward": 0.917479949771516, "rewards/mask_iou_reward": 0.458739974885758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7955086827278137, "rewards/thk_ans_format_reward": 1.0, "step": 414, "think_completion_length": 54.4375 }, { "clip_ratio": 0.0, "completion_length": 123.5, "epoch": 0.6998313659359191, "grad_norm": 5.030120907805138, "kl": 0.3251953125, "learning_rate": 8.600337268128162e-07, "loss": 0.0003, "reward": 3.044167995452881, "reward_std": 0.06912581558572128, "rewards/final_reward": 1.9550748572924652, "rewards/mask_iou_reward": 0.9775374286462326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0441679954528809, "rewards/thk_ans_format_reward": 1.0, "step": 415, "think_completion_length": 53.75 }, { "clip_ratio": 0.0, "completion_length": 121.03125, "epoch": 0.7015177065767285, "grad_norm": 10.346031417599695, "kl": 0.294921875, "learning_rate": 8.596964586846543e-07, "loss": 0.0003, "reward": 3.1020259857177734, "reward_std": 0.1669246181845665, "rewards/final_reward": 1.4006585781103693, "rewards/mask_iou_reward": 0.7003292890551847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.102025881409645, "rewards/thk_ans_format_reward": 1.0, "step": 416, "think_completion_length": 50.96875 }, { "clip_ratio": 0.0, "completion_length": 128.5, "epoch": 0.7032040472175379, "grad_norm": 5.559416835383098, "kl": 0.267578125, "learning_rate": 8.593591905564925e-07, "loss": 0.0003, "reward": 2.8304479122161865, "reward_std": 0.1307654045522213, "rewards/final_reward": 0.2791251550299971, "rewards/mask_iou_reward": 0.13956257751499854, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8304478824138641, "rewards/thk_ans_format_reward": 1.0, "step": 417, "think_completion_length": 62.4375 }, { "clip_ratio": 0.0, "completion_length": 127.765625, "epoch": 0.7048903878583473, "grad_norm": 16.700794405142094, "kl": 0.271484375, "learning_rate": 8.590219224283305e-07, "loss": 0.0003, "reward": 2.828887462615967, "reward_std": 0.2920425906777382, "rewards/final_reward": 0.5944796988166807, "rewards/mask_iou_reward": 0.29723984940834036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8288873136043549, "rewards/thk_ans_format_reward": 1.0, "step": 418, "think_completion_length": 60.125 }, { "clip_ratio": 0.0, "completion_length": 159.625, "epoch": 0.7065767284991569, "grad_norm": 5.229023826318465, "kl": 0.287109375, "learning_rate": 8.586846543001685e-07, "loss": 0.0003, "reward": 2.6701961755752563, "reward_std": 0.5146700888872147, "rewards/final_reward": 0.768534040115647, "rewards/mask_iou_reward": 0.3842670200578235, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.701446145772934, "rewards/thk_ans_format_reward": 0.984375, "step": 419, "think_completion_length": 55.59375 }, { "clip_ratio": 0.0, "completion_length": 179.03125, "epoch": 0.7082630691399663, "grad_norm": 4.366390117260031, "kl": 0.2724609375, "learning_rate": 8.583473861720067e-07, "loss": 0.0003, "reward": 2.7694051265716553, "reward_std": 0.35564553551375866, "rewards/final_reward": 0.7180693794692734, "rewards/mask_iou_reward": 0.3590346897346367, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.8319052159786224, "rewards/thk_ans_format_reward": 0.96875, "step": 420, "think_completion_length": 61.40625 }, { "clip_ratio": 0.0, "completion_length": 160.5625, "epoch": 0.7099494097807757, "grad_norm": 8.26275894078137, "kl": 0.28515625, "learning_rate": 8.580101180438448e-07, "loss": 0.0003, "reward": 3.165099620819092, "reward_std": 0.32927022874355316, "rewards/final_reward": 0.9518278358961747, "rewards/mask_iou_reward": 0.47591391794808735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1650997400283813, "rewards/thk_ans_format_reward": 1.0, "step": 421, "think_completion_length": 55.90625 }, { "clip_ratio": 0.0, "completion_length": 158.328125, "epoch": 0.7116357504215851, "grad_norm": 4.796511312898097, "kl": 0.3564453125, "learning_rate": 8.576728499156829e-07, "loss": 0.0004, "reward": 2.997436285018921, "reward_std": 0.3313639760017395, "rewards/final_reward": 0.5018222892159665, "rewards/mask_iou_reward": 0.25091114460798325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9974363744258881, "rewards/thk_ans_format_reward": 1.0, "step": 422, "think_completion_length": 58.875 }, { "clip_ratio": 0.0, "completion_length": 128.71875, "epoch": 0.7133220910623946, "grad_norm": 17.89257487840126, "kl": 0.29296875, "learning_rate": 8.573355817875211e-07, "loss": 0.0003, "reward": 2.97784960269928, "reward_std": 0.34186942130327225, "rewards/final_reward": 1.1728364824516073, "rewards/mask_iou_reward": 0.5864182412258037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9778496026992798, "rewards/thk_ans_format_reward": 1.0, "step": 423, "think_completion_length": 56.375 }, { "clip_ratio": 0.0, "completion_length": 132.890625, "epoch": 0.715008431703204, "grad_norm": 27.214424601047234, "kl": 0.294921875, "learning_rate": 8.569983136593592e-07, "loss": 0.0003, "reward": 2.7833904027938843, "reward_std": 0.28692834824323654, "rewards/final_reward": 0.9084582272270576, "rewards/mask_iou_reward": 0.4542291136135288, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7833903729915619, "rewards/thk_ans_format_reward": 1.0, "step": 424, "think_completion_length": 56.875 }, { "clip_ratio": 0.0, "completion_length": 126.8125, "epoch": 0.7166947723440135, "grad_norm": 40.38312340410711, "kl": 0.33984375, "learning_rate": 8.566610455311973e-07, "loss": 0.0003, "reward": 2.905064821243286, "reward_std": 0.1572525054216385, "rewards/final_reward": 1.1837076841446386, "rewards/mask_iou_reward": 0.5918538420723193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9050647914409637, "rewards/thk_ans_format_reward": 1.0, "step": 425, "think_completion_length": 64.03125 }, { "clip_ratio": 0.0, "completion_length": 119.953125, "epoch": 0.718381112984823, "grad_norm": 7.976773584921408, "kl": 0.6875, "learning_rate": 8.563237774030355e-07, "loss": 0.0007, "reward": 3.05864155292511, "reward_std": 0.3007048964500427, "rewards/final_reward": 1.1670635620638414, "rewards/mask_iou_reward": 0.5835317810319207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.058641493320465, "rewards/thk_ans_format_reward": 1.0, "step": 426, "think_completion_length": 60.375 }, { "clip_ratio": 0.0, "completion_length": 129.296875, "epoch": 0.7200674536256324, "grad_norm": 8.273608489018805, "kl": 0.29296875, "learning_rate": 8.559865092748734e-07, "loss": 0.0003, "reward": 2.8656177520751953, "reward_std": 0.36590053141117096, "rewards/final_reward": 0.5663732712946641, "rewards/mask_iou_reward": 0.28318663564733204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8656177222728729, "rewards/thk_ans_format_reward": 1.0, "step": 427, "think_completion_length": 53.75 }, { "clip_ratio": 0.0, "completion_length": 114.65625, "epoch": 0.7217537942664418, "grad_norm": 9.39576308947908, "kl": 0.2861328125, "learning_rate": 8.556492411467115e-07, "loss": 0.0003, "reward": 2.8028112649917603, "reward_std": 0.22308364510536194, "rewards/final_reward": 1.1169452408649283, "rewards/mask_iou_reward": 0.5584726204324642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8028113842010498, "rewards/thk_ans_format_reward": 1.0, "step": 428, "think_completion_length": 49.46875 }, { "clip_ratio": 0.0, "completion_length": 124.59375, "epoch": 0.7234401349072512, "grad_norm": 5.036837067527288, "kl": 0.30859375, "learning_rate": 8.553119730185497e-07, "loss": 0.0003, "reward": 3.52843701839447, "reward_std": 0.2567114308476448, "rewards/final_reward": 1.6959970281876124, "rewards/mask_iou_reward": 0.8479985140938062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5284370183944702, "rewards/thk_ans_format_reward": 1.0, "step": 429, "think_completion_length": 55.25 }, { "clip_ratio": 0.0, "completion_length": 133.671875, "epoch": 0.7251264755480608, "grad_norm": 8.45011833698191, "kl": 0.294921875, "learning_rate": 8.549747048903878e-07, "loss": 0.0003, "reward": 2.8881938457489014, "reward_std": 0.11123907007277012, "rewards/final_reward": 0.5909519443980289, "rewards/mask_iou_reward": 0.29547597219901445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8881938755512238, "rewards/thk_ans_format_reward": 1.0, "step": 430, "think_completion_length": 62.1875 }, { "clip_ratio": 0.0, "completion_length": 128.5625, "epoch": 0.7268128161888702, "grad_norm": 5.126242185281033, "kl": 0.3310546875, "learning_rate": 8.546374367622259e-07, "loss": 0.0003, "reward": 3.2686800956726074, "reward_std": 0.47582103312015533, "rewards/final_reward": 1.1995088677204089, "rewards/mask_iou_reward": 0.5997544338602044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.268679916858673, "rewards/thk_ans_format_reward": 1.0, "step": 431, "think_completion_length": 56.125 }, { "clip_ratio": 0.0, "completion_length": 126.328125, "epoch": 0.7284991568296796, "grad_norm": 7.510954485318972, "kl": 0.3037109375, "learning_rate": 8.543001686340641e-07, "loss": 0.0003, "reward": 2.963119626045227, "reward_std": 0.5091882646083832, "rewards/final_reward": 1.1603252698717035, "rewards/mask_iou_reward": 0.5801626349358517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9631196856498718, "rewards/thk_ans_format_reward": 1.0, "step": 432, "think_completion_length": 53.59375 }, { "clip_ratio": 0.0, "completion_length": 159.875, "epoch": 0.730185497470489, "grad_norm": 8.17404395724238, "kl": 0.2451171875, "learning_rate": 8.539629005059022e-07, "loss": 0.0002, "reward": 3.3582974672317505, "reward_std": 0.27676521986722946, "rewards/final_reward": 1.060991388104146, "rewards/mask_iou_reward": 0.530495694052073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.358297348022461, "rewards/thk_ans_format_reward": 1.0, "step": 433, "think_completion_length": 48.84375 }, { "clip_ratio": 0.0, "completion_length": 134.921875, "epoch": 0.7318718381112985, "grad_norm": 5.1275072969602675, "kl": 0.2978515625, "learning_rate": 8.536256323777403e-07, "loss": 0.0003, "reward": 3.2699029445648193, "reward_std": 0.12833164259791374, "rewards/final_reward": 1.371142408100008, "rewards/mask_iou_reward": 0.685571204050004, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.269902914762497, "rewards/thk_ans_format_reward": 1.0, "step": 434, "think_completion_length": 63.25 }, { "clip_ratio": 0.0, "completion_length": 124.40625, "epoch": 0.7335581787521079, "grad_norm": 8.844781927351123, "kl": 0.2880859375, "learning_rate": 8.532883642495783e-07, "loss": 0.0005, "reward": 3.6087805032730103, "reward_std": 0.2516215443611145, "rewards/final_reward": 1.46631224588285, "rewards/mask_iou_reward": 0.733156122941425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6087803840637207, "rewards/thk_ans_format_reward": 1.0, "step": 435, "think_completion_length": 58.0625 }, { "clip_ratio": 0.0, "completion_length": 123.84375, "epoch": 0.7352445193929174, "grad_norm": 9.045015551736409, "kl": 0.283203125, "learning_rate": 8.529510961214164e-07, "loss": 0.0003, "reward": 3.1191418170928955, "reward_std": 0.17643820121884346, "rewards/final_reward": 0.5086317075081106, "rewards/mask_iou_reward": 0.2543158537540553, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1191417574882507, "rewards/thk_ans_format_reward": 1.0, "step": 436, "think_completion_length": 55.09375 }, { "clip_ratio": 0.0, "completion_length": 123.25, "epoch": 0.7369308600337268, "grad_norm": 8.044272316503301, "kl": 0.322265625, "learning_rate": 8.526138279932546e-07, "loss": 0.0003, "reward": 2.923910140991211, "reward_std": 0.2595134302973747, "rewards/final_reward": 0.39993271395890284, "rewards/mask_iou_reward": 0.19996635697945142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9239100217819214, "rewards/thk_ans_format_reward": 1.0, "step": 437, "think_completion_length": 54.5625 }, { "clip_ratio": 0.0, "completion_length": 142.1875, "epoch": 0.7386172006745363, "grad_norm": 6.42343047551497, "kl": 0.2958984375, "learning_rate": 8.522765598650927e-07, "loss": 0.0003, "reward": 3.2516669034957886, "reward_std": 0.3042123168706894, "rewards/final_reward": 1.4425390635201096, "rewards/mask_iou_reward": 0.7212695317600548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2516670227050781, "rewards/thk_ans_format_reward": 1.0, "step": 438, "think_completion_length": 58.09375 }, { "clip_ratio": 0.0, "completion_length": 153.3125, "epoch": 0.7403035413153457, "grad_norm": 4.765939573950677, "kl": 0.2431640625, "learning_rate": 8.519392917369308e-07, "loss": 0.0002, "reward": 2.7552138566970825, "reward_std": 0.29110707342624664, "rewards/final_reward": 0.7833496236061104, "rewards/mask_iou_reward": 0.3916748118030552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7552138268947601, "rewards/thk_ans_format_reward": 1.0, "step": 439, "think_completion_length": 56.78125 }, { "clip_ratio": 0.0, "completion_length": 127.4375, "epoch": 0.7419898819561551, "grad_norm": 6.901156078063723, "kl": 0.259765625, "learning_rate": 8.51602023608769e-07, "loss": 0.0003, "reward": 3.238754630088806, "reward_std": 0.36059945821762085, "rewards/final_reward": 1.6652710435028883, "rewards/mask_iou_reward": 0.8326355217514442, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2543795108795166, "rewards/thk_ans_format_reward": 1.0, "step": 440, "think_completion_length": 61.875 }, { "clip_ratio": 0.0, "completion_length": 120.890625, "epoch": 0.7436762225969646, "grad_norm": 9.619707399796134, "kl": 0.3603515625, "learning_rate": 8.512647554806071e-07, "loss": 0.0004, "reward": 3.349504232406616, "reward_std": 0.09203607961535454, "rewards/final_reward": 1.811421625096705, "rewards/mask_iou_reward": 0.9057108125483525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3495042622089386, "rewards/thk_ans_format_reward": 1.0, "step": 441, "think_completion_length": 52.125 }, { "clip_ratio": 0.0, "completion_length": 115.59375, "epoch": 0.7453625632377741, "grad_norm": 7.184268284891987, "kl": 0.3095703125, "learning_rate": 8.509274873524452e-07, "loss": 0.0003, "reward": 3.019321084022522, "reward_std": 0.3987215608358383, "rewards/final_reward": 0.8732108529144026, "rewards/mask_iou_reward": 0.4366054264572013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0193210542201996, "rewards/thk_ans_format_reward": 1.0, "step": 442, "think_completion_length": 47.875 }, { "clip_ratio": 0.0, "completion_length": 132.015625, "epoch": 0.7470489038785835, "grad_norm": 12.515671581951322, "kl": 0.326171875, "learning_rate": 8.505902192242834e-07, "loss": 0.0003, "reward": 3.155945658683777, "reward_std": 0.34589822590351105, "rewards/final_reward": 1.1646884123020445, "rewards/mask_iou_reward": 0.5823442061510222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1559456586837769, "rewards/thk_ans_format_reward": 1.0, "step": 443, "think_completion_length": 59.96875 }, { "clip_ratio": 0.0, "completion_length": 132.921875, "epoch": 0.7487352445193929, "grad_norm": 3.547481875341391, "kl": 0.287109375, "learning_rate": 8.502529510961213e-07, "loss": 0.0003, "reward": 2.903050661087036, "reward_std": 0.07714477553963661, "rewards/final_reward": 1.0310059078481788, "rewards/mask_iou_reward": 0.5155029539240894, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9030508100986481, "rewards/thk_ans_format_reward": 1.0, "step": 444, "think_completion_length": 52.34375 }, { "clip_ratio": 0.0, "completion_length": 131.1875, "epoch": 0.7504215851602024, "grad_norm": 9.083111303607465, "kl": 0.298828125, "learning_rate": 8.499156829679594e-07, "loss": 0.0003, "reward": 3.021065354347229, "reward_std": 0.15533466637134552, "rewards/final_reward": 1.4759822302666619, "rewards/mask_iou_reward": 0.7379911151333309, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.021065354347229, "rewards/thk_ans_format_reward": 1.0, "step": 445, "think_completion_length": 58.90625 }, { "clip_ratio": 0.0, "completion_length": 135.359375, "epoch": 0.7521079258010118, "grad_norm": 7.087376979581363, "kl": 0.26171875, "learning_rate": 8.495784148397976e-07, "loss": 0.0003, "reward": 3.4048237800598145, "reward_std": 0.4206378608942032, "rewards/final_reward": 1.2012251051824736, "rewards/mask_iou_reward": 0.6006125525912368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4048238396644592, "rewards/thk_ans_format_reward": 1.0, "step": 446, "think_completion_length": 57.71875 }, { "clip_ratio": 0.0, "completion_length": 130.703125, "epoch": 0.7537942664418212, "grad_norm": 5.4476413106060875, "kl": 0.24951171875, "learning_rate": 8.492411467116357e-07, "loss": 0.0003, "reward": 3.3550602197647095, "reward_std": 0.1978924423456192, "rewards/final_reward": 1.5025127897357047, "rewards/mask_iou_reward": 0.7512563948678523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.355060338973999, "rewards/thk_ans_format_reward": 1.0, "step": 447, "think_completion_length": 49.40625 }, { "clip_ratio": 0.0, "completion_length": 139.90625, "epoch": 0.7554806070826307, "grad_norm": 4.176777412533717, "kl": 0.2626953125, "learning_rate": 8.489038785834738e-07, "loss": 0.0003, "reward": 3.3239400386810303, "reward_std": 0.23745188117027283, "rewards/final_reward": 1.7510122330778342, "rewards/mask_iou_reward": 0.8755061165389171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3239399194717407, "rewards/thk_ans_format_reward": 1.0, "step": 448, "think_completion_length": 52.1875 }, { "clip_ratio": 0.0, "completion_length": 134.8125, "epoch": 0.7571669477234402, "grad_norm": 3.8561049155278124, "kl": 0.298828125, "learning_rate": 8.48566610455312e-07, "loss": 0.0003, "reward": 3.1175063848495483, "reward_std": 0.3207996618002653, "rewards/final_reward": 1.6053231020652237, "rewards/mask_iou_reward": 0.8026615510326118, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1487562656402588, "rewards/thk_ans_format_reward": 0.984375, "step": 449, "think_completion_length": 54.65625 }, { "clip_ratio": 0.0, "completion_length": 129.65625, "epoch": 0.7588532883642496, "grad_norm": 4.769355012789226, "kl": 0.3505859375, "learning_rate": 8.482293423271501e-07, "loss": 0.0004, "reward": 2.6317962408065796, "reward_std": 0.16405843198299408, "rewards/final_reward": 0.31454674175310304, "rewards/mask_iou_reward": 0.15727337087655152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6317960768938065, "rewards/thk_ans_format_reward": 1.0, "step": 450, "think_completion_length": 56.0625 }, { "clip_ratio": 0.0, "completion_length": 127.265625, "epoch": 0.760539629005059, "grad_norm": 7.400810968987018, "kl": 0.2763671875, "learning_rate": 8.478920741989882e-07, "loss": 0.0003, "reward": 3.499594807624817, "reward_std": 0.24827680736780167, "rewards/final_reward": 1.292793728943769, "rewards/mask_iou_reward": 0.6463968644718845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4995947480201721, "rewards/thk_ans_format_reward": 1.0, "step": 451, "think_completion_length": 50.8125 }, { "clip_ratio": 0.0, "completion_length": 121.59375, "epoch": 0.7622259696458684, "grad_norm": 6.445672712478356, "kl": 0.341796875, "learning_rate": 8.475548060708263e-07, "loss": 0.0003, "reward": 2.749513268470764, "reward_std": 0.3254034221172333, "rewards/final_reward": 0.9117902511222327, "rewards/mask_iou_reward": 0.45589512556111633, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7495132386684418, "rewards/thk_ans_format_reward": 1.0, "step": 452, "think_completion_length": 55.96875 }, { "clip_ratio": 0.0, "completion_length": 136.390625, "epoch": 0.7639123102866779, "grad_norm": 13.204711329933593, "kl": 0.3173828125, "learning_rate": 8.472175379426643e-07, "loss": 0.0003, "reward": 3.0490126609802246, "reward_std": 0.40351493656635284, "rewards/final_reward": 1.0314188630098042, "rewards/mask_iou_reward": 0.5157094315049021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0490127503871918, "rewards/thk_ans_format_reward": 1.0, "step": 453, "think_completion_length": 57.65625 }, { "clip_ratio": 0.0, "completion_length": 123.375, "epoch": 0.7655986509274874, "grad_norm": 6.324030150115455, "kl": 0.3359375, "learning_rate": 8.468802698145024e-07, "loss": 0.0003, "reward": 2.9564003944396973, "reward_std": 0.23579375445842743, "rewards/final_reward": 0.7703477680311337, "rewards/mask_iou_reward": 0.38517388401556685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.956400454044342, "rewards/thk_ans_format_reward": 1.0, "step": 454, "think_completion_length": 49.78125 }, { "clip_ratio": 0.0, "completion_length": 125.578125, "epoch": 0.7672849915682968, "grad_norm": 4.925706148560462, "kl": 0.2919921875, "learning_rate": 8.465430016863406e-07, "loss": 0.0003, "reward": 3.0482248067855835, "reward_std": 0.3343783766031265, "rewards/final_reward": 1.1269153962105007, "rewards/mask_iou_reward": 0.5634576981052504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0482248067855835, "rewards/thk_ans_format_reward": 1.0, "step": 455, "think_completion_length": 51.9375 }, { "clip_ratio": 0.0, "completion_length": 138.625, "epoch": 0.7689713322091062, "grad_norm": 5.465123188001045, "kl": 0.2822265625, "learning_rate": 8.462057335581787e-07, "loss": 0.0003, "reward": 3.0701215267181396, "reward_std": 0.14017308503389359, "rewards/final_reward": 1.8171637650895573, "rewards/mask_iou_reward": 0.9085818825447787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.07012140750885, "rewards/thk_ans_format_reward": 1.0, "step": 456, "think_completion_length": 52.0 }, { "clip_ratio": 0.0, "completion_length": 120.828125, "epoch": 0.7706576728499157, "grad_norm": 4.005358062471752, "kl": 0.3115234375, "learning_rate": 8.458684654300168e-07, "loss": 0.0003, "reward": 2.987306594848633, "reward_std": 0.3185913637280464, "rewards/final_reward": 1.4051766924639586, "rewards/mask_iou_reward": 0.7025883462319793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9873065650463104, "rewards/thk_ans_format_reward": 1.0, "step": 457, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 125.140625, "epoch": 0.7723440134907251, "grad_norm": 11.194917130532907, "kl": 0.30078125, "learning_rate": 8.45531197301855e-07, "loss": 0.0003, "reward": 3.5356940031051636, "reward_std": 0.1198413036763668, "rewards/final_reward": 1.6100836459413137, "rewards/mask_iou_reward": 0.8050418229706569, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5356940627098083, "rewards/thk_ans_format_reward": 1.0, "step": 458, "think_completion_length": 56.9375 }, { "clip_ratio": 0.0, "completion_length": 123.609375, "epoch": 0.7740303541315345, "grad_norm": 4.153232928144624, "kl": 0.322265625, "learning_rate": 8.451939291736931e-07, "loss": 0.0003, "reward": 3.207147717475891, "reward_std": 0.38715776801109314, "rewards/final_reward": 1.0808576457404304, "rewards/mask_iou_reward": 0.5404288228702152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2071476578712463, "rewards/thk_ans_format_reward": 1.0, "step": 459, "think_completion_length": 52.28125 }, { "clip_ratio": 0.0, "completion_length": 155.765625, "epoch": 0.7757166947723441, "grad_norm": 10.159888358886413, "kl": 0.2685546875, "learning_rate": 8.448566610455311e-07, "loss": 0.0003, "reward": 2.656245470046997, "reward_std": 0.19464854151010513, "rewards/final_reward": 0.43508841005784793, "rewards/mask_iou_reward": 0.21754420502892396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6562454402446747, "rewards/thk_ans_format_reward": 1.0, "step": 460, "think_completion_length": 49.96875 }, { "clip_ratio": 0.0, "completion_length": 178.5625, "epoch": 0.7774030354131535, "grad_norm": 4.856925040426539, "kl": 0.3203125, "learning_rate": 8.445193929173693e-07, "loss": 0.0003, "reward": 3.1314727067947388, "reward_std": 0.17312223464250565, "rewards/final_reward": 1.608206343195508, "rewards/mask_iou_reward": 0.804103171597754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1314727067947388, "rewards/thk_ans_format_reward": 1.0, "step": 461, "think_completion_length": 58.75 }, { "clip_ratio": 0.0, "completion_length": 143.546875, "epoch": 0.7790893760539629, "grad_norm": 14.882051882248911, "kl": 0.291015625, "learning_rate": 8.441821247892073e-07, "loss": 0.0003, "reward": 2.922551989555359, "reward_std": 0.24346740171313286, "rewards/final_reward": 1.2441537792490682, "rewards/mask_iou_reward": 0.6220768896245341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9225519299507141, "rewards/thk_ans_format_reward": 1.0, "step": 462, "think_completion_length": 56.40625 }, { "clip_ratio": 0.0, "completion_length": 156.5, "epoch": 0.7807757166947723, "grad_norm": 7.4432447360727005, "kl": 0.28662109375, "learning_rate": 8.438448566610455e-07, "loss": 0.0003, "reward": 3.0491660833358765, "reward_std": 0.3176800534129143, "rewards/final_reward": 0.5361547861618222, "rewards/mask_iou_reward": 0.2680773930809111, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.1116660237312317, "rewards/thk_ans_format_reward": 1.0, "step": 463, "think_completion_length": 61.84375 }, { "clip_ratio": 0.0, "completion_length": 134.71875, "epoch": 0.7824620573355818, "grad_norm": 8.644391998731278, "kl": 0.322265625, "learning_rate": 8.435075885328836e-07, "loss": 0.0003, "reward": 3.0543339252471924, "reward_std": 0.13533685728907585, "rewards/final_reward": 1.2730188577493136, "rewards/mask_iou_reward": 0.6365094288746568, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0543338507413864, "rewards/thk_ans_format_reward": 1.0, "step": 464, "think_completion_length": 53.4375 }, { "clip_ratio": 0.0, "completion_length": 169.140625, "epoch": 0.7841483979763912, "grad_norm": 236.67023834501367, "kl": 0.2685546875, "learning_rate": 8.431703204047217e-07, "loss": 0.0003, "reward": 3.6658883094787598, "reward_std": 0.15900836139917374, "rewards/final_reward": 1.5048619609851641, "rewards/mask_iou_reward": 0.7524309804925821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.665888249874115, "rewards/thk_ans_format_reward": 1.0, "step": 465, "think_completion_length": 60.9375 }, { "clip_ratio": 0.0, "completion_length": 156.609375, "epoch": 0.7858347386172007, "grad_norm": 4.741441128552988, "kl": 0.2890625, "learning_rate": 8.428330522765599e-07, "loss": 0.0003, "reward": 3.2664116621017456, "reward_std": 0.20561707019805908, "rewards/final_reward": 1.6148482767720258, "rewards/mask_iou_reward": 0.8074241383860129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.266411691904068, "rewards/thk_ans_format_reward": 1.0, "step": 466, "think_completion_length": 53.75 }, { "clip_ratio": 0.0, "completion_length": 149.984375, "epoch": 0.7875210792580101, "grad_norm": 6.036213971503519, "kl": 0.310546875, "learning_rate": 8.42495784148398e-07, "loss": 0.0003, "reward": 3.273001790046692, "reward_std": 0.23280290514230728, "rewards/final_reward": 1.3179749980699165, "rewards/mask_iou_reward": 0.6589874990349582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2730017304420471, "rewards/thk_ans_format_reward": 1.0, "step": 467, "think_completion_length": 58.03125 }, { "clip_ratio": 0.0, "completion_length": 154.84375, "epoch": 0.7892074198988196, "grad_norm": 7.819806233732989, "kl": 0.322265625, "learning_rate": 8.42158516020236e-07, "loss": 0.0003, "reward": 2.7713377475738525, "reward_std": 0.21657298505306244, "rewards/final_reward": 0.8699708025221008, "rewards/mask_iou_reward": 0.4349854012610504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7713376581668854, "rewards/thk_ans_format_reward": 1.0, "step": 468, "think_completion_length": 59.96875 }, { "clip_ratio": 0.0, "completion_length": 165.90625, "epoch": 0.790893760539629, "grad_norm": 5.404044503889141, "kl": 0.3291015625, "learning_rate": 8.418212478920742e-07, "loss": 0.0003, "reward": 3.161504626274109, "reward_std": 0.2850091755390167, "rewards/final_reward": 1.14387468377842, "rewards/mask_iou_reward": 0.57193734188921, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1615045070648193, "rewards/thk_ans_format_reward": 1.0, "step": 469, "think_completion_length": 56.34375 }, { "clip_ratio": 0.0, "completion_length": 176.96875, "epoch": 0.7925801011804384, "grad_norm": 7.604389194636123, "kl": 0.3642578125, "learning_rate": 8.414839797639123e-07, "loss": 0.0004, "reward": 2.8981428146362305, "reward_std": 0.36169466376304626, "rewards/final_reward": 1.0320069000804137, "rewards/mask_iou_reward": 0.5160034500402069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8981429636478424, "rewards/thk_ans_format_reward": 1.0, "step": 470, "think_completion_length": 65.84375 }, { "clip_ratio": 0.0, "completion_length": 149.40625, "epoch": 0.7942664418212478, "grad_norm": 12.40271629362927, "kl": 0.27734375, "learning_rate": 8.411467116357503e-07, "loss": 0.0003, "reward": 3.4766829013824463, "reward_std": 0.17510409653186798, "rewards/final_reward": 1.563823732939018, "rewards/mask_iou_reward": 0.781911866469509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4766828417778015, "rewards/thk_ans_format_reward": 1.0, "step": 471, "think_completion_length": 58.78125 }, { "clip_ratio": 0.0, "completion_length": 139.3125, "epoch": 0.7959527824620574, "grad_norm": 4.105674434527155, "kl": 0.556640625, "learning_rate": 8.408094435075885e-07, "loss": 0.0006, "reward": 3.3228079080581665, "reward_std": 0.14854015782475471, "rewards/final_reward": 1.5614363244482155, "rewards/mask_iou_reward": 0.7807181622241077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3228079676628113, "rewards/thk_ans_format_reward": 1.0, "step": 472, "think_completion_length": 53.9375 }, { "clip_ratio": 0.0, "completion_length": 143.515625, "epoch": 0.7976391231028668, "grad_norm": 14.121466586094188, "kl": 0.3408203125, "learning_rate": 8.404721753794266e-07, "loss": 0.0003, "reward": 2.8374698162078857, "reward_std": 0.2110204752534628, "rewards/final_reward": 0.6061206330509881, "rewards/mask_iou_reward": 0.30306031652549403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8374697864055634, "rewards/thk_ans_format_reward": 1.0, "step": 473, "think_completion_length": 55.65625 }, { "clip_ratio": 0.0, "completion_length": 145.0625, "epoch": 0.7993254637436762, "grad_norm": 9.046995607060756, "kl": 0.3359375, "learning_rate": 8.401349072512647e-07, "loss": 0.0003, "reward": 3.3081858158111572, "reward_std": 0.3434343636035919, "rewards/final_reward": 1.4127900623675984, "rewards/mask_iou_reward": 0.7063950311837992, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3081855773925781, "rewards/thk_ans_format_reward": 1.0, "step": 474, "think_completion_length": 59.5625 }, { "clip_ratio": 0.0, "completion_length": 182.328125, "epoch": 0.8010118043844857, "grad_norm": 16.95326296908787, "kl": 0.2724609375, "learning_rate": 8.397976391231029e-07, "loss": 0.0003, "reward": 3.488841414451599, "reward_std": 0.2021305412054062, "rewards/final_reward": 1.6205699107923395, "rewards/mask_iou_reward": 0.8102849553961697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4888412356376648, "rewards/thk_ans_format_reward": 1.0, "step": 475, "think_completion_length": 53.53125 }, { "clip_ratio": 0.0, "completion_length": 144.9375, "epoch": 0.8026981450252951, "grad_norm": 8.257579139699448, "kl": 0.353515625, "learning_rate": 8.39460370994941e-07, "loss": 0.0004, "reward": 3.1359875202178955, "reward_std": 0.1910172551870346, "rewards/final_reward": 0.6040533924945672, "rewards/mask_iou_reward": 0.3020266962472836, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1359875202178955, "rewards/thk_ans_format_reward": 1.0, "step": 476, "think_completion_length": 60.5625 }, { "clip_ratio": 0.0, "completion_length": 162.265625, "epoch": 0.8043844856661045, "grad_norm": 8.58037790808054, "kl": 0.2919921875, "learning_rate": 8.39123102866779e-07, "loss": 0.0003, "reward": 3.001362442970276, "reward_std": 0.11581205576658249, "rewards/final_reward": 0.34856413924459007, "rewards/mask_iou_reward": 0.17428206962229503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0013624429702759, "rewards/thk_ans_format_reward": 1.0, "step": 477, "think_completion_length": 56.375 }, { "clip_ratio": 0.0, "completion_length": 139.265625, "epoch": 0.806070826306914, "grad_norm": 3.9427534797600767, "kl": 0.3232421875, "learning_rate": 8.387858347386172e-07, "loss": 0.0003, "reward": 3.370658040046692, "reward_std": 0.23730356991291046, "rewards/final_reward": 1.6140919982067383, "rewards/mask_iou_reward": 0.8070459991033692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.370658040046692, "rewards/thk_ans_format_reward": 1.0, "step": 478, "think_completion_length": 56.53125 }, { "clip_ratio": 0.0, "completion_length": 159.390625, "epoch": 0.8077571669477235, "grad_norm": 3.3744960399290114, "kl": 0.2802734375, "learning_rate": 8.384485666104552e-07, "loss": 0.0003, "reward": 2.2663588523864746, "reward_std": 0.2115717504057102, "rewards/final_reward": 0.26012163497015883, "rewards/mask_iou_reward": 0.13006081748507942, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.26635879278182983, "rewards/thk_ans_format_reward": 1.0, "step": 479, "think_completion_length": 56.5625 }, { "clip_ratio": 0.0, "completion_length": 158.765625, "epoch": 0.8094435075885329, "grad_norm": 3.6117344364232564, "kl": 0.2666015625, "learning_rate": 8.381112984822933e-07, "loss": 0.0003, "reward": 2.8762000799179077, "reward_std": 0.388136625289917, "rewards/final_reward": 0.9140486507211179, "rewards/mask_iou_reward": 0.4570243253605589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8762001395225525, "rewards/thk_ans_format_reward": 1.0, "step": 480, "think_completion_length": 63.28125 }, { "clip_ratio": 0.0, "completion_length": 156.65625, "epoch": 0.8111298482293423, "grad_norm": 12.274297517004138, "kl": 0.478515625, "learning_rate": 8.377740303541315e-07, "loss": 0.0005, "reward": 3.2747581005096436, "reward_std": 0.26797255873680115, "rewards/final_reward": 1.6817001867058976, "rewards/mask_iou_reward": 0.8408500933529488, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2903832793235779, "rewards/thk_ans_format_reward": 1.0, "step": 481, "think_completion_length": 52.1875 }, { "clip_ratio": 0.0, "completion_length": 161.09375, "epoch": 0.8128161888701517, "grad_norm": 5.527163191205217, "kl": 0.3662109375, "learning_rate": 8.374367622259696e-07, "loss": 0.0004, "reward": 2.8398313522338867, "reward_std": 0.3691897839307785, "rewards/final_reward": 0.5070145637787651, "rewards/mask_iou_reward": 0.25350728188938254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8398312926292419, "rewards/thk_ans_format_reward": 1.0, "step": 482, "think_completion_length": 60.375 }, { "clip_ratio": 0.0, "completion_length": 161.4375, "epoch": 0.8145025295109612, "grad_norm": 9.457144918818893, "kl": 0.3134765625, "learning_rate": 8.370994940978077e-07, "loss": 0.0003, "reward": 2.8676928281784058, "reward_std": 0.21387110650539398, "rewards/final_reward": 0.7955313358824923, "rewards/mask_iou_reward": 0.3977656679412461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8676928579807281, "rewards/thk_ans_format_reward": 1.0, "step": 483, "think_completion_length": 56.15625 }, { "clip_ratio": 0.0, "completion_length": 162.484375, "epoch": 0.8161888701517707, "grad_norm": 5.765839208328575, "kl": 0.291015625, "learning_rate": 8.367622259696459e-07, "loss": 0.0003, "reward": 3.332192301750183, "reward_std": 0.2074635624885559, "rewards/final_reward": 1.3884839448598976, "rewards/mask_iou_reward": 0.6942419724299488, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3321923613548279, "rewards/thk_ans_format_reward": 1.0, "step": 484, "think_completion_length": 60.9375 }, { "clip_ratio": 0.0, "completion_length": 162.515625, "epoch": 0.8178752107925801, "grad_norm": 13.058251127540307, "kl": 0.291015625, "learning_rate": 8.364249578414839e-07, "loss": 0.0003, "reward": 3.4136351346969604, "reward_std": 0.1272813342511654, "rewards/final_reward": 1.6252411969988985, "rewards/mask_iou_reward": 0.8126205984994492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.413635015487671, "rewards/thk_ans_format_reward": 1.0, "step": 485, "think_completion_length": 53.9375 }, { "clip_ratio": 0.0, "completion_length": 172.484375, "epoch": 0.8195615514333895, "grad_norm": 4.59058157342402, "kl": 0.2724609375, "learning_rate": 8.360876897133221e-07, "loss": 0.0003, "reward": 2.841328501701355, "reward_std": 0.2238992303609848, "rewards/final_reward": 0.4358989520701159, "rewards/mask_iou_reward": 0.21794947603505796, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8413284122943878, "rewards/thk_ans_format_reward": 1.0, "step": 486, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 144.03125, "epoch": 0.821247892074199, "grad_norm": 5.2583036747207865, "kl": 0.296875, "learning_rate": 8.357504215851602e-07, "loss": 0.0003, "reward": 3.279032826423645, "reward_std": 0.34882715344429016, "rewards/final_reward": 1.1094078905881353, "rewards/mask_iou_reward": 0.5547039452940676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2790327668190002, "rewards/thk_ans_format_reward": 1.0, "step": 487, "think_completion_length": 56.75 }, { "clip_ratio": 0.0, "completion_length": 164.1875, "epoch": 0.8229342327150084, "grad_norm": 12.262889885729598, "kl": 0.5107421875, "learning_rate": 8.354131534569982e-07, "loss": 0.0005, "reward": 2.6642863750457764, "reward_std": 0.4135005921125412, "rewards/final_reward": 0.902978774756644, "rewards/mask_iou_reward": 0.451489387378322, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.679911345243454, "rewards/thk_ans_format_reward": 1.0, "step": 488, "think_completion_length": 57.9375 }, { "clip_ratio": 0.0, "completion_length": 147.296875, "epoch": 0.8246205733558178, "grad_norm": 6.485249587949392, "kl": 0.322265625, "learning_rate": 8.350758853288364e-07, "loss": 0.0003, "reward": 3.394331693649292, "reward_std": 0.16855868697166443, "rewards/final_reward": 1.1107550160110016, "rewards/mask_iou_reward": 0.5553775080055008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3943317532539368, "rewards/thk_ans_format_reward": 1.0, "step": 489, "think_completion_length": 57.125 }, { "clip_ratio": 0.0, "completion_length": 168.453125, "epoch": 0.8263069139966274, "grad_norm": 8.664907971971088, "kl": 0.34765625, "learning_rate": 8.347386172006745e-07, "loss": 0.0003, "reward": 2.957345962524414, "reward_std": 0.19981549307703972, "rewards/final_reward": 0.8727845301331817, "rewards/mask_iou_reward": 0.43639226506659085, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9573458433151245, "rewards/thk_ans_format_reward": 1.0, "step": 490, "think_completion_length": 65.65625 }, { "clip_ratio": 0.0, "completion_length": 142.703125, "epoch": 0.8279932546374368, "grad_norm": 5.276331158403494, "kl": 0.3896484375, "learning_rate": 8.344013490725126e-07, "loss": 0.0004, "reward": 3.023526191711426, "reward_std": 0.3681245595216751, "rewards/final_reward": 0.7070453517651918, "rewards/mask_iou_reward": 0.3535226758825959, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.05477637052536, "rewards/thk_ans_format_reward": 1.0, "step": 491, "think_completion_length": 53.125 }, { "clip_ratio": 0.0, "completion_length": 135.84375, "epoch": 0.8296795952782462, "grad_norm": 15.27579742522537, "kl": 0.3515625, "learning_rate": 8.340640809443508e-07, "loss": 0.0004, "reward": 3.013556480407715, "reward_std": 0.594033494591713, "rewards/final_reward": 0.626667284580518, "rewards/mask_iou_reward": 0.313333642290259, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.0760565400123596, "rewards/thk_ans_format_reward": 1.0, "step": 492, "think_completion_length": 55.375 }, { "clip_ratio": 0.0, "completion_length": 133.5, "epoch": 0.8313659359190556, "grad_norm": 4.6907462569949585, "kl": 0.5, "learning_rate": 8.337268128161888e-07, "loss": 0.0005, "reward": 2.9373987913131714, "reward_std": 0.1304899863898754, "rewards/final_reward": 0.9829659043346416, "rewards/mask_iou_reward": 0.4914829521673208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9373987317085266, "rewards/thk_ans_format_reward": 1.0, "step": 493, "think_completion_length": 58.46875 }, { "clip_ratio": 0.0, "completion_length": 128.6875, "epoch": 0.8330522765598651, "grad_norm": 9.07332574925447, "kl": 0.3671875, "learning_rate": 8.333895446880269e-07, "loss": 0.0004, "reward": 3.1510684490203857, "reward_std": 0.17891812324523926, "rewards/final_reward": 0.48630485725933903, "rewards/mask_iou_reward": 0.24315242862966951, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1510685086250305, "rewards/thk_ans_format_reward": 1.0, "step": 494, "think_completion_length": 58.9375 }, { "clip_ratio": 0.0, "completion_length": 137.796875, "epoch": 0.8347386172006745, "grad_norm": 4.086735711528925, "kl": 0.3623046875, "learning_rate": 8.330522765598651e-07, "loss": 0.0004, "reward": 3.338440179824829, "reward_std": 0.2079987023025751, "rewards/final_reward": 0.8487996371223201, "rewards/mask_iou_reward": 0.42439981856116005, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3540650606155396, "rewards/thk_ans_format_reward": 1.0, "step": 495, "think_completion_length": 58.6875 }, { "clip_ratio": 0.0, "completion_length": 155.125, "epoch": 0.836424957841484, "grad_norm": 5.467494355508177, "kl": 0.3486328125, "learning_rate": 8.327150084317032e-07, "loss": 0.0003, "reward": 2.9036959409713745, "reward_std": 0.0915520153939724, "rewards/final_reward": 0.5634759685080848, "rewards/mask_iou_reward": 0.2817379842540424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9036959111690521, "rewards/thk_ans_format_reward": 1.0, "step": 496, "think_completion_length": 55.15625 }, { "clip_ratio": 0.0, "completion_length": 152.734375, "epoch": 0.8381112984822934, "grad_norm": 4.609982837739563, "kl": 0.3603515625, "learning_rate": 8.323777403035412e-07, "loss": 0.0004, "reward": 2.9951345920562744, "reward_std": 0.14687485992908478, "rewards/final_reward": 0.5120508227904494, "rewards/mask_iou_reward": 0.2560254113952247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.995134562253952, "rewards/thk_ans_format_reward": 1.0, "step": 497, "think_completion_length": 57.15625 }, { "clip_ratio": 0.0, "completion_length": 132.765625, "epoch": 0.8397976391231029, "grad_norm": 4.49715286866878, "kl": 0.6982421875, "learning_rate": 8.320404721753794e-07, "loss": 0.0007, "reward": 2.702091932296753, "reward_std": 0.06957501918077469, "rewards/final_reward": 0.5184969514126482, "rewards/mask_iou_reward": 0.2592484757063241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.702091857790947, "rewards/thk_ans_format_reward": 1.0, "step": 498, "think_completion_length": 57.03125 }, { "clip_ratio": 0.0, "completion_length": 137.21875, "epoch": 0.8414839797639123, "grad_norm": 5.876436406383071, "kl": 0.41015625, "learning_rate": 8.317032040472175e-07, "loss": 0.0004, "reward": 3.532025933265686, "reward_std": 0.31206123530864716, "rewards/final_reward": 1.6488765208842402, "rewards/mask_iou_reward": 0.8244382604421201, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5320259928703308, "rewards/thk_ans_format_reward": 1.0, "step": 499, "think_completion_length": 60.40625 }, { "clip_ratio": 0.0, "completion_length": 129.609375, "epoch": 0.8431703204047217, "grad_norm": 8.430320529931935, "kl": 0.3818359375, "learning_rate": 8.313659359190556e-07, "loss": 0.0004, "reward": 3.1462095975875854, "reward_std": 0.14214863628149033, "rewards/final_reward": 1.4178498719028585, "rewards/mask_iou_reward": 0.7089249359514292, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1462095975875854, "rewards/thk_ans_format_reward": 1.0, "step": 500, "think_completion_length": 55.28125 }, { "clip_ratio": 0.0, "completion_length": 143.984375, "epoch": 0.8448566610455311, "grad_norm": 11.055793749524597, "kl": 0.37109375, "learning_rate": 8.310286677908938e-07, "loss": 0.0004, "reward": 3.4042888879776, "reward_std": 0.2672403007745743, "rewards/final_reward": 1.3030006843771489, "rewards/mask_iou_reward": 0.6515003421885744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4042889475822449, "rewards/thk_ans_format_reward": 1.0, "step": 501, "think_completion_length": 59.84375 }, { "clip_ratio": 0.0, "completion_length": 159.15625, "epoch": 0.8465430016863407, "grad_norm": 4.353326318987238, "kl": 0.8125, "learning_rate": 8.306913996627318e-07, "loss": 0.0008, "reward": 2.6968055963516235, "reward_std": 0.42137467861175537, "rewards/final_reward": 0.5881671125422409, "rewards/mask_iou_reward": 0.3264447243490181, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7124305069446564, "rewards/thk_ans_format_reward": 1.0, "step": 502, "think_completion_length": 57.15625 }, { "clip_ratio": 0.0, "completion_length": 144.453125, "epoch": 0.8482293423271501, "grad_norm": 3.952799501986196, "kl": 0.3857421875, "learning_rate": 8.303541315345699e-07, "loss": 0.0004, "reward": 3.111401081085205, "reward_std": 0.23835711553692818, "rewards/final_reward": 1.4018418315631394, "rewards/mask_iou_reward": 0.7009209157815697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.111401081085205, "rewards/thk_ans_format_reward": 1.0, "step": 503, "think_completion_length": 58.1875 }, { "clip_ratio": 0.0, "completion_length": 140.921875, "epoch": 0.8499156829679595, "grad_norm": 6.092795086100114, "kl": 0.408203125, "learning_rate": 8.300168634064081e-07, "loss": 0.0004, "reward": 3.0179593563079834, "reward_std": 0.43792441487312317, "rewards/final_reward": 1.1153791394943096, "rewards/mask_iou_reward": 0.5576895697471548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0179592669010162, "rewards/thk_ans_format_reward": 1.0, "step": 504, "think_completion_length": 63.9375 }, { "clip_ratio": 0.0, "completion_length": 137.109375, "epoch": 0.851602023608769, "grad_norm": 5.84142921702047, "kl": 0.38671875, "learning_rate": 8.296795952782462e-07, "loss": 0.0004, "reward": 2.6653823852539062, "reward_std": 0.3925536721944809, "rewards/final_reward": 1.2697986540779345, "rewards/mask_iou_reward": 0.6348993270389672, "rewards/sam_format_reward": 0.875, "rewards/sam_reward_func_ultra": 0.790382444858551, "rewards/thk_ans_format_reward": 1.0, "step": 505, "think_completion_length": 60.0 }, { "clip_ratio": 0.0, "completion_length": 147.421875, "epoch": 0.8532883642495784, "grad_norm": 9.174156204307186, "kl": 0.3623046875, "learning_rate": 8.293423271500842e-07, "loss": 0.0004, "reward": 3.216946005821228, "reward_std": 0.12585578113794327, "rewards/final_reward": 1.8653463056949018, "rewards/mask_iou_reward": 0.9326731528474509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.216946005821228, "rewards/thk_ans_format_reward": 1.0, "step": 506, "think_completion_length": 56.71875 }, { "clip_ratio": 0.0, "completion_length": 136.125, "epoch": 0.8549747048903878, "grad_norm": 4.338273951521515, "kl": 0.4072265625, "learning_rate": 8.290050590219224e-07, "loss": 0.0004, "reward": 2.273374557495117, "reward_std": 0.08910224586725235, "rewards/final_reward": 0.3796306792781109, "rewards/mask_iou_reward": 0.18981533963905545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.2733745574951172, "rewards/thk_ans_format_reward": 1.0, "step": 507, "think_completion_length": 64.34375 }, { "clip_ratio": 0.0, "completion_length": 136.53125, "epoch": 0.8566610455311973, "grad_norm": 4.048280846796833, "kl": 0.3779296875, "learning_rate": 8.286677908937605e-07, "loss": 0.0004, "reward": 3.340023159980774, "reward_std": 0.14114241860806942, "rewards/final_reward": 1.3392693923469996, "rewards/mask_iou_reward": 0.6696346961734998, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3400232195854187, "rewards/thk_ans_format_reward": 1.0, "step": 508, "think_completion_length": 62.96875 }, { "clip_ratio": 0.0, "completion_length": 134.5, "epoch": 0.8583473861720068, "grad_norm": 6.439504346827565, "kl": 0.4326171875, "learning_rate": 8.283305227655986e-07, "loss": 0.0004, "reward": 3.283868193626404, "reward_std": 0.1502309814095497, "rewards/final_reward": 1.4718184265458358, "rewards/mask_iou_reward": 0.7359092132729179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2838680744171143, "rewards/thk_ans_format_reward": 1.0, "step": 509, "think_completion_length": 65.03125 }, { "clip_ratio": 0.0, "completion_length": 139.0625, "epoch": 0.8600337268128162, "grad_norm": 6.735960884512077, "kl": 0.40234375, "learning_rate": 8.279932546374367e-07, "loss": 0.0004, "reward": 2.945590615272522, "reward_std": 0.16057924553751945, "rewards/final_reward": 1.1542222895062477, "rewards/mask_iou_reward": 0.5771111447531239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9455906599760056, "rewards/thk_ans_format_reward": 1.0, "step": 510, "think_completion_length": 63.34375 }, { "clip_ratio": 0.0, "completion_length": 150.5625, "epoch": 0.8617200674536256, "grad_norm": 5.712862299060118, "kl": 0.4208984375, "learning_rate": 8.276559865092748e-07, "loss": 0.0004, "reward": 2.683882713317871, "reward_std": 0.37942691147327423, "rewards/final_reward": 0.5198905041900906, "rewards/mask_iou_reward": 0.2599452520950453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6838827431201935, "rewards/thk_ans_format_reward": 1.0, "step": 511, "think_completion_length": 63.46875 }, { "clip_ratio": 0.0, "completion_length": 163.46875, "epoch": 0.863406408094435, "grad_norm": 28.652231899021555, "kl": 0.3955078125, "learning_rate": 8.27318718381113e-07, "loss": 0.0004, "reward": 2.9515405893325806, "reward_std": 0.3559226468205452, "rewards/final_reward": 1.0275246177182993, "rewards/mask_iou_reward": 0.5137623088591496, "rewards/sam_format_reward": 0.890625, "rewards/sam_reward_func_ultra": 1.060915619134903, "rewards/thk_ans_format_reward": 1.0, "step": 512, "think_completion_length": 66.75 }, { "clip_ratio": 0.0, "completion_length": 143.171875, "epoch": 0.8650927487352446, "grad_norm": 4.1058269502843645, "kl": 0.3701171875, "learning_rate": 8.269814502529511e-07, "loss": 0.0004, "reward": 2.8999738693237305, "reward_std": 0.0674455501139164, "rewards/final_reward": 0.47537297320235095, "rewards/mask_iou_reward": 0.23768648660117547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8999738991260529, "rewards/thk_ans_format_reward": 1.0, "step": 513, "think_completion_length": 61.0625 }, { "clip_ratio": 0.0, "completion_length": 166.328125, "epoch": 0.866779089376054, "grad_norm": 2.9463795478386157, "kl": 0.376953125, "learning_rate": 8.266441821247892e-07, "loss": 0.0004, "reward": 3.56209397315979, "reward_std": 0.24439280480146408, "rewards/final_reward": 1.7721260543580972, "rewards/mask_iou_reward": 0.8860630271790486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.562093734741211, "rewards/thk_ans_format_reward": 1.0, "step": 514, "think_completion_length": 68.125 }, { "clip_ratio": 0.0, "completion_length": 168.453125, "epoch": 0.8684654300168634, "grad_norm": 5.345688348718463, "kl": 0.373046875, "learning_rate": 8.263069139966273e-07, "loss": 0.0004, "reward": 2.648374915122986, "reward_std": 0.14371401071548462, "rewards/final_reward": 0.8322886750975954, "rewards/mask_iou_reward": 0.4161443375487977, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6483748480677605, "rewards/thk_ans_format_reward": 1.0, "step": 515, "think_completion_length": 63.59375 }, { "clip_ratio": 0.0, "completion_length": 231.734375, "epoch": 0.8701517706576728, "grad_norm": 6.266616889593935, "kl": 0.3095703125, "learning_rate": 8.259696458684654e-07, "loss": 0.0003, "reward": 3.183814764022827, "reward_std": 0.20516617968678474, "rewards/final_reward": 1.4832190450462837, "rewards/mask_iou_reward": 0.7416095225231418, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.199439823627472, "rewards/thk_ans_format_reward": 1.0, "step": 516, "think_completion_length": 63.03125 }, { "clip_ratio": 0.0, "completion_length": 145.78125, "epoch": 0.8718381112984823, "grad_norm": 6.359772663934902, "kl": 0.396484375, "learning_rate": 8.256323777403035e-07, "loss": 0.0004, "reward": 3.377490758895874, "reward_std": 0.16431526839733124, "rewards/final_reward": 1.2387319760179651, "rewards/mask_iou_reward": 0.6193659880089826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3774908185005188, "rewards/thk_ans_format_reward": 1.0, "step": 517, "think_completion_length": 58.28125 }, { "clip_ratio": 0.0, "completion_length": 132.671875, "epoch": 0.8735244519392917, "grad_norm": 8.28235082705572, "kl": 0.392578125, "learning_rate": 8.252951096121416e-07, "loss": 0.0004, "reward": 3.2795443534851074, "reward_std": 0.2162407599389553, "rewards/final_reward": 1.4022729958833613, "rewards/mask_iou_reward": 0.7011364979416806, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2795442342758179, "rewards/thk_ans_format_reward": 1.0, "step": 518, "think_completion_length": 60.65625 }, { "clip_ratio": 0.0, "completion_length": 139.65625, "epoch": 0.8752107925801011, "grad_norm": 16.66235998201711, "kl": 0.38671875, "learning_rate": 8.249578414839797e-07, "loss": 0.0004, "reward": 3.3192691802978516, "reward_std": 0.16263797972351313, "rewards/final_reward": 1.0817492556014303, "rewards/mask_iou_reward": 0.5408746278007152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3192691802978516, "rewards/thk_ans_format_reward": 1.0, "step": 519, "think_completion_length": 62.71875 }, { "clip_ratio": 0.0, "completion_length": 170.21875, "epoch": 0.8768971332209107, "grad_norm": 7.827687818773585, "kl": 0.330078125, "learning_rate": 8.246205733558178e-07, "loss": 0.0003, "reward": 2.7803075313568115, "reward_std": 0.31534768640995026, "rewards/final_reward": 0.7727706577683782, "rewards/mask_iou_reward": 0.3863853288841891, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7959325313568115, "rewards/thk_ans_format_reward": 1.0, "step": 520, "think_completion_length": 74.78125 }, { "clip_ratio": 0.0, "completion_length": 165.203125, "epoch": 0.8785834738617201, "grad_norm": 4.032912423168109, "kl": 0.353515625, "learning_rate": 8.24283305227656e-07, "loss": 0.0004, "reward": 3.183266282081604, "reward_std": 0.2568514347076416, "rewards/final_reward": 1.327475335566459, "rewards/mask_iou_reward": 0.6637376677832295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1832663118839264, "rewards/thk_ans_format_reward": 1.0, "step": 521, "think_completion_length": 65.15625 }, { "clip_ratio": 0.0, "completion_length": 164.828125, "epoch": 0.8802698145025295, "grad_norm": 10.421736927007183, "kl": 0.3662109375, "learning_rate": 8.239460370994941e-07, "loss": 0.0004, "reward": 3.470924496650696, "reward_std": 0.10463305935263634, "rewards/final_reward": 1.0585844424707966, "rewards/mask_iou_reward": 0.5292922212353983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4709245562553406, "rewards/thk_ans_format_reward": 1.0, "step": 522, "think_completion_length": 58.96875 }, { "clip_ratio": 0.0, "completion_length": 138.484375, "epoch": 0.8819561551433389, "grad_norm": 5.559082708566229, "kl": 0.5234375, "learning_rate": 8.236087689713322e-07, "loss": 0.0005, "reward": 3.0396599769592285, "reward_std": 0.157942034304142, "rewards/final_reward": 1.1188812180385286, "rewards/mask_iou_reward": 0.5594406090192643, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0396599769592285, "rewards/thk_ans_format_reward": 1.0, "step": 523, "think_completion_length": 67.59375 }, { "clip_ratio": 0.0, "completion_length": 135.484375, "epoch": 0.8836424957841484, "grad_norm": 6.252306064416424, "kl": 0.4580078125, "learning_rate": 8.232715008431703e-07, "loss": 0.0005, "reward": 3.2086377143859863, "reward_std": 0.2545919269323349, "rewards/final_reward": 1.1811606394896785, "rewards/mask_iou_reward": 0.5905803197448393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2086376249790192, "rewards/thk_ans_format_reward": 1.0, "step": 524, "think_completion_length": 64.6875 }, { "clip_ratio": 0.0, "completion_length": 203.140625, "epoch": 0.8853288364249579, "grad_norm": 4.371512130440283, "kl": 0.5361328125, "learning_rate": 8.229342327150084e-07, "loss": 0.0005, "reward": 3.3854466676712036, "reward_std": 0.10468383133411407, "rewards/final_reward": 1.4435736608997933, "rewards/mask_iou_reward": 0.7217868304498967, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3854466080665588, "rewards/thk_ans_format_reward": 1.0, "step": 525, "think_completion_length": 67.21875 }, { "clip_ratio": 0.0, "completion_length": 144.875, "epoch": 0.8870151770657673, "grad_norm": 5.152020009962662, "kl": 0.3486328125, "learning_rate": 8.225969645868464e-07, "loss": 0.0003, "reward": 3.441176652908325, "reward_std": 0.2706274315714836, "rewards/final_reward": 1.7451023757817863, "rewards/mask_iou_reward": 0.8725511878908931, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4411765933036804, "rewards/thk_ans_format_reward": 1.0, "step": 526, "think_completion_length": 72.34375 }, { "clip_ratio": 0.0, "completion_length": 155.625, "epoch": 0.8887015177065767, "grad_norm": 21.687079355389578, "kl": 0.3818359375, "learning_rate": 8.222596964586846e-07, "loss": 0.0004, "reward": 2.9406185150146484, "reward_std": 0.2899327874183655, "rewards/final_reward": 0.8891764659295585, "rewards/mask_iou_reward": 0.44458823296477923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9406185150146484, "rewards/thk_ans_format_reward": 1.0, "step": 527, "think_completion_length": 60.40625 }, { "clip_ratio": 0.0, "completion_length": 143.828125, "epoch": 0.8903878583473862, "grad_norm": 4.36400370590007, "kl": 0.375, "learning_rate": 8.219224283305227e-07, "loss": 0.0004, "reward": 2.7132362127304077, "reward_std": 0.1695428118109703, "rewards/final_reward": 0.23595122885585143, "rewards/mask_iou_reward": 0.11797561442792572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7132362425327301, "rewards/thk_ans_format_reward": 1.0, "step": 528, "think_completion_length": 64.15625 }, { "clip_ratio": 0.0, "completion_length": 168.65625, "epoch": 0.8920741989881956, "grad_norm": 13.437495085356725, "kl": 0.4208984375, "learning_rate": 8.215851602023608e-07, "loss": 0.0004, "reward": 3.3183414936065674, "reward_std": 0.2359558790922165, "rewards/final_reward": 1.1896120618016504, "rewards/mask_iou_reward": 0.5948060309008252, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3183414936065674, "rewards/thk_ans_format_reward": 1.0, "step": 529, "think_completion_length": 66.6875 }, { "clip_ratio": 0.0, "completion_length": 135.296875, "epoch": 0.893760539629005, "grad_norm": 7.6705294118384275, "kl": 0.3984375, "learning_rate": 8.21247892074199e-07, "loss": 0.0004, "reward": 3.419031500816345, "reward_std": 0.2190863024443388, "rewards/final_reward": 1.1311254608719006, "rewards/mask_iou_reward": 0.5655627304359503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.41903156042099, "rewards/thk_ans_format_reward": 1.0, "step": 530, "think_completion_length": 64.96875 }, { "clip_ratio": 0.0, "completion_length": 164.078125, "epoch": 0.8954468802698144, "grad_norm": 20.472054925385102, "kl": 0.3505859375, "learning_rate": 8.209106239460371e-07, "loss": 0.0003, "reward": 3.381049633026123, "reward_std": 0.23917018435895443, "rewards/final_reward": 1.5125118845948695, "rewards/mask_iou_reward": 0.7562559422974348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3810496926307678, "rewards/thk_ans_format_reward": 1.0, "step": 531, "think_completion_length": 64.1875 }, { "clip_ratio": 0.0, "completion_length": 182.328125, "epoch": 0.897133220910624, "grad_norm": 3.8168364492030333, "kl": 0.3720703125, "learning_rate": 8.205733558178752e-07, "loss": 0.0003, "reward": 3.023830771446228, "reward_std": 0.169752950896509, "rewards/final_reward": 1.1518864899917212, "rewards/mask_iou_reward": 0.5759432449958606, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.0707058385014534, "rewards/thk_ans_format_reward": 0.984375, "step": 532, "think_completion_length": 64.9375 }, { "clip_ratio": 0.0, "completion_length": 169.703125, "epoch": 0.8988195615514334, "grad_norm": 15.967853474424993, "kl": 0.384765625, "learning_rate": 8.202360876897133e-07, "loss": 0.0004, "reward": 2.7728532552719116, "reward_std": 0.139126755297184, "rewards/final_reward": 0.5571142070550955, "rewards/mask_iou_reward": 0.27855710352754776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.772853247821331, "rewards/thk_ans_format_reward": 1.0, "step": 533, "think_completion_length": 76.09375 }, { "clip_ratio": 0.0, "completion_length": 157.515625, "epoch": 0.9005059021922428, "grad_norm": 14.985099224834219, "kl": 1.0390625, "learning_rate": 8.198988195615514e-07, "loss": 0.001, "reward": 2.8130897283554077, "reward_std": 0.22141174226999283, "rewards/final_reward": 0.3782079504236656, "rewards/mask_iou_reward": 0.1891039752118328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8130897581577301, "rewards/thk_ans_format_reward": 1.0, "step": 534, "think_completion_length": 65.5625 }, { "clip_ratio": 0.0, "completion_length": 150.171875, "epoch": 0.9021922428330523, "grad_norm": 4.841638655066179, "kl": 0.3994140625, "learning_rate": 8.195615514333894e-07, "loss": 0.0004, "reward": 2.7947330474853516, "reward_std": 0.16086240857839584, "rewards/final_reward": 0.5827494551033152, "rewards/mask_iou_reward": 0.2913747275516576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.794733077287674, "rewards/thk_ans_format_reward": 1.0, "step": 535, "think_completion_length": 73.25 }, { "clip_ratio": 0.0, "completion_length": 165.234375, "epoch": 0.9038785834738617, "grad_norm": 3.6751630699269984, "kl": 0.4228515625, "learning_rate": 8.192242833052276e-07, "loss": 0.0004, "reward": 3.0260074138641357, "reward_std": 0.11613265797495842, "rewards/final_reward": 0.7657504846263233, "rewards/mask_iou_reward": 0.38287524231316167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.041632503271103, "rewards/thk_ans_format_reward": 0.984375, "step": 536, "think_completion_length": 64.34375 }, { "clip_ratio": 0.0, "completion_length": 135.515625, "epoch": 0.9055649241146712, "grad_norm": 4.476174693329742, "kl": 0.453125, "learning_rate": 8.188870151770657e-07, "loss": 0.0004, "reward": 3.5284619331359863, "reward_std": 0.05958326905965805, "rewards/final_reward": 1.8054081588094992, "rewards/mask_iou_reward": 0.9027040794047496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5284621119499207, "rewards/thk_ans_format_reward": 1.0, "step": 537, "think_completion_length": 65.59375 }, { "clip_ratio": 0.0, "completion_length": 142.5625, "epoch": 0.9072512647554806, "grad_norm": 4.295697569667424, "kl": 0.369140625, "learning_rate": 8.185497470489039e-07, "loss": 0.0004, "reward": 2.8253973722457886, "reward_std": 0.17487270198762417, "rewards/final_reward": 0.36516283126835664, "rewards/mask_iou_reward": 0.18258141563417832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8253973722457886, "rewards/thk_ans_format_reward": 1.0, "step": 538, "think_completion_length": 71.1875 }, { "clip_ratio": 0.0, "completion_length": 135.59375, "epoch": 0.9089376053962901, "grad_norm": 12.537328062284256, "kl": 0.380859375, "learning_rate": 8.18212478920742e-07, "loss": 0.0004, "reward": 3.381394863128662, "reward_std": 0.16334578022360802, "rewards/final_reward": 1.6468299087358207, "rewards/mask_iou_reward": 0.8234149543679103, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.381394863128662, "rewards/thk_ans_format_reward": 1.0, "step": 539, "think_completion_length": 67.1875 }, { "clip_ratio": 0.0, "completion_length": 171.0625, "epoch": 0.9106239460370995, "grad_norm": 5.631609952798637, "kl": 0.373046875, "learning_rate": 8.178752107925801e-07, "loss": 0.0004, "reward": 3.1018731594085693, "reward_std": 0.4340359643101692, "rewards/final_reward": 1.2089573280546717, "rewards/mask_iou_reward": 0.6044786640273359, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1487482786178589, "rewards/thk_ans_format_reward": 0.96875, "step": 540, "think_completion_length": 70.40625 }, { "clip_ratio": 0.0, "completion_length": 143.375, "epoch": 0.9123102866779089, "grad_norm": 6.251334882971103, "kl": 0.3701171875, "learning_rate": 8.175379426644183e-07, "loss": 0.0004, "reward": 3.3850208520889282, "reward_std": 0.15002886205911636, "rewards/final_reward": 1.3596994323781733, "rewards/mask_iou_reward": 0.6798497161890866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3850208520889282, "rewards/thk_ans_format_reward": 1.0, "step": 541, "think_completion_length": 70.28125 }, { "clip_ratio": 0.0, "completion_length": 139.5625, "epoch": 0.9139966273187183, "grad_norm": 6.1318709306129255, "kl": 0.3701171875, "learning_rate": 8.172006745362563e-07, "loss": 0.0004, "reward": 3.1013495922088623, "reward_std": 0.48096051812171936, "rewards/final_reward": 1.24424121940097, "rewards/mask_iou_reward": 0.622120609700485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1013494729995728, "rewards/thk_ans_format_reward": 1.0, "step": 542, "think_completion_length": 67.1875 }, { "clip_ratio": 0.0, "completion_length": 158.40625, "epoch": 0.9156829679595279, "grad_norm": 10.854889940904636, "kl": 0.298828125, "learning_rate": 8.168634064080943e-07, "loss": 0.0003, "reward": 3.2696839570999146, "reward_std": 0.17891769856214523, "rewards/final_reward": 1.592977107171023, "rewards/mask_iou_reward": 0.7964885535855115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.269684076309204, "rewards/thk_ans_format_reward": 1.0, "step": 543, "think_completion_length": 70.1875 }, { "clip_ratio": 0.0, "completion_length": 141.78125, "epoch": 0.9173693086003373, "grad_norm": 4.23331281750137, "kl": 0.369140625, "learning_rate": 8.165261382799325e-07, "loss": 0.0004, "reward": 3.4304239749908447, "reward_std": 0.1314825750887394, "rewards/final_reward": 1.207980122650095, "rewards/mask_iou_reward": 0.6039900613250475, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4304240345954895, "rewards/thk_ans_format_reward": 1.0, "step": 544, "think_completion_length": 70.71875 }, { "clip_ratio": 0.0, "completion_length": 152.265625, "epoch": 0.9190556492411467, "grad_norm": 7.031147279257311, "kl": 0.3369140625, "learning_rate": 8.161888701517706e-07, "loss": 0.0003, "reward": 3.361212372779846, "reward_std": 0.21165333688259125, "rewards/final_reward": 1.2343404362180397, "rewards/mask_iou_reward": 0.6171702181090198, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3612123727798462, "rewards/thk_ans_format_reward": 1.0, "step": 545, "think_completion_length": 78.5625 }, { "clip_ratio": 0.0, "completion_length": 147.0, "epoch": 0.9207419898819561, "grad_norm": 5.202780347128779, "kl": 0.3369140625, "learning_rate": 8.158516020236087e-07, "loss": 0.0003, "reward": 3.3819774389266968, "reward_std": 0.21501677110791206, "rewards/final_reward": 1.4839641986939198, "rewards/mask_iou_reward": 0.7419820993469599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3819775581359863, "rewards/thk_ans_format_reward": 1.0, "step": 546, "think_completion_length": 73.15625 }, { "clip_ratio": 0.0, "completion_length": 155.484375, "epoch": 0.9224283305227656, "grad_norm": 14.149938755345266, "kl": 0.369140625, "learning_rate": 8.155143338954469e-07, "loss": 0.0004, "reward": 3.032560348510742, "reward_std": 0.3051258474588394, "rewards/final_reward": 1.182298730010252, "rewards/mask_iou_reward": 0.591149365005126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0325604677200317, "rewards/thk_ans_format_reward": 1.0, "step": 547, "think_completion_length": 79.375 }, { "clip_ratio": 0.0, "completion_length": 155.0625, "epoch": 0.924114671163575, "grad_norm": 7.103217899009958, "kl": 0.3955078125, "learning_rate": 8.15177065767285e-07, "loss": 0.0004, "reward": 3.4782878160476685, "reward_std": 0.29576554894447327, "rewards/final_reward": 1.7047875744492655, "rewards/mask_iou_reward": 0.8523937872246328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4782878160476685, "rewards/thk_ans_format_reward": 1.0, "step": 548, "think_completion_length": 75.125 }, { "clip_ratio": 0.0, "completion_length": 148.96875, "epoch": 0.9258010118043845, "grad_norm": 4.674635909867585, "kl": 0.3935546875, "learning_rate": 8.148397976391231e-07, "loss": 0.0004, "reward": 3.0649091005325317, "reward_std": 0.10538779571652412, "rewards/final_reward": 1.4885215280987056, "rewards/mask_iou_reward": 0.7442607640493528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0649092197418213, "rewards/thk_ans_format_reward": 1.0, "step": 549, "think_completion_length": 77.21875 }, { "clip_ratio": 0.0, "completion_length": 149.578125, "epoch": 0.927487352445194, "grad_norm": 10.824977561332439, "kl": 0.392578125, "learning_rate": 8.145025295109613e-07, "loss": 0.0004, "reward": 2.901737332344055, "reward_std": 0.2872447445988655, "rewards/final_reward": 0.3606779283185213, "rewards/mask_iou_reward": 0.18033896415926065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9017373025417328, "rewards/thk_ans_format_reward": 1.0, "step": 550, "think_completion_length": 80.90625 }, { "clip_ratio": 0.0, "completion_length": 152.140625, "epoch": 0.9291736930860034, "grad_norm": 3.335434593783744, "kl": 0.3525390625, "learning_rate": 8.141652613827992e-07, "loss": 0.0004, "reward": 2.98220694065094, "reward_std": 0.1780674085021019, "rewards/final_reward": 0.9759100328842547, "rewards/mask_iou_reward": 0.48795501644212735, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9822070002555847, "rewards/thk_ans_format_reward": 1.0, "step": 551, "think_completion_length": 86.5 }, { "clip_ratio": 0.0, "completion_length": 152.296875, "epoch": 0.9308600337268128, "grad_norm": 13.530986743553575, "kl": 0.3818359375, "learning_rate": 8.138279932546373e-07, "loss": 0.0004, "reward": 3.046547293663025, "reward_std": 0.38355183601379395, "rewards/final_reward": 0.5386104314584983, "rewards/mask_iou_reward": 0.26930521572924915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0465472042560577, "rewards/thk_ans_format_reward": 1.0, "step": 552, "think_completion_length": 81.3125 }, { "clip_ratio": 0.0, "completion_length": 148.84375, "epoch": 0.9325463743676222, "grad_norm": 7.018348869059321, "kl": 0.37890625, "learning_rate": 8.134907251264755e-07, "loss": 0.0004, "reward": 3.382186532020569, "reward_std": 0.1798281967639923, "rewards/final_reward": 1.0075253248878073, "rewards/mask_iou_reward": 0.5037626624439037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3821865320205688, "rewards/thk_ans_format_reward": 1.0, "step": 553, "think_completion_length": 78.03125 }, { "clip_ratio": 0.0, "completion_length": 149.578125, "epoch": 0.9342327150084317, "grad_norm": 6.620793871261661, "kl": 0.3818359375, "learning_rate": 8.131534569983136e-07, "loss": 0.0004, "reward": 3.33063006401062, "reward_std": 0.20287129282951355, "rewards/final_reward": 0.9096661811312425, "rewards/mask_iou_reward": 0.45483309056562127, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3306299448013306, "rewards/thk_ans_format_reward": 1.0, "step": 554, "think_completion_length": 76.9375 }, { "clip_ratio": 0.0, "completion_length": 149.890625, "epoch": 0.9359190556492412, "grad_norm": 5.218423197482733, "kl": 0.35546875, "learning_rate": 8.128161888701517e-07, "loss": 0.0004, "reward": 2.9394739866256714, "reward_std": 0.4641287475824356, "rewards/final_reward": 0.8743346962894474, "rewards/mask_iou_reward": 0.4371673481447237, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9394739866256714, "rewards/thk_ans_format_reward": 1.0, "step": 555, "think_completion_length": 84.1875 }, { "clip_ratio": 0.0, "completion_length": 166.984375, "epoch": 0.9376053962900506, "grad_norm": 18.845803308886303, "kl": 0.3994140625, "learning_rate": 8.124789207419899e-07, "loss": 0.0004, "reward": 3.325919985771179, "reward_std": 0.30405908077955246, "rewards/final_reward": 1.2264077496832324, "rewards/mask_iou_reward": 0.6132038748416162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.325919896364212, "rewards/thk_ans_format_reward": 1.0, "step": 556, "think_completion_length": 78.90625 }, { "clip_ratio": 0.0, "completion_length": 215.328125, "epoch": 0.93929173693086, "grad_norm": 5.499154937677363, "kl": 0.34765625, "learning_rate": 8.12141652613828e-07, "loss": 0.0003, "reward": 2.9827044010162354, "reward_std": 0.350845642387867, "rewards/final_reward": 0.6183570090078176, "rewards/mask_iou_reward": 0.3091785045039088, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0139543414115906, "rewards/thk_ans_format_reward": 0.984375, "step": 557, "think_completion_length": 76.625 }, { "clip_ratio": 0.0, "completion_length": 142.890625, "epoch": 0.9409780775716695, "grad_norm": 8.221926187139507, "kl": 0.3671875, "learning_rate": 8.118043844856661e-07, "loss": 0.0004, "reward": 3.4161949157714844, "reward_std": 0.24237601598724723, "rewards/final_reward": 1.3035412984268884, "rewards/mask_iou_reward": 0.6517706492134442, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4161949753761292, "rewards/thk_ans_format_reward": 1.0, "step": 558, "think_completion_length": 77.0625 }, { "clip_ratio": 0.0, "completion_length": 154.078125, "epoch": 0.9426644182124789, "grad_norm": 4.669917534704876, "kl": 0.3662109375, "learning_rate": 8.114671163575043e-07, "loss": 0.0004, "reward": 2.6135934591293335, "reward_std": 0.38687272369861603, "rewards/final_reward": 0.2619348523224978, "rewards/mask_iou_reward": 0.1309674261612489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6135933995246887, "rewards/thk_ans_format_reward": 1.0, "step": 559, "think_completion_length": 83.40625 }, { "clip_ratio": 0.0, "completion_length": 174.140625, "epoch": 0.9443507588532883, "grad_norm": 5.903891117155913, "kl": 0.35546875, "learning_rate": 8.111298482293422e-07, "loss": 0.0004, "reward": 3.100181221961975, "reward_std": 0.18918309919536114, "rewards/final_reward": 1.320369290484187, "rewards/mask_iou_reward": 0.6601846452420935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1001812517642975, "rewards/thk_ans_format_reward": 1.0, "step": 560, "think_completion_length": 88.40625 }, { "clip_ratio": 0.0, "completion_length": 155.8125, "epoch": 0.9460370994940978, "grad_norm": 5.2513254819440265, "kl": 0.40625, "learning_rate": 8.107925801011804e-07, "loss": 0.0004, "reward": 3.007655143737793, "reward_std": 0.1493750810623169, "rewards/final_reward": 1.0473003270407384, "rewards/mask_iou_reward": 0.5236501635203692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0076551735401154, "rewards/thk_ans_format_reward": 1.0, "step": 561, "think_completion_length": 80.375 }, { "clip_ratio": 0.0, "completion_length": 174.5, "epoch": 0.9477234401349073, "grad_norm": 6.63392999354413, "kl": 0.357421875, "learning_rate": 8.104553119730185e-07, "loss": 0.0004, "reward": 3.241023063659668, "reward_std": 0.3681875765323639, "rewards/final_reward": 1.285415503132128, "rewards/mask_iou_reward": 0.642707751566064, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2566478848457336, "rewards/thk_ans_format_reward": 1.0, "step": 562, "think_completion_length": 86.8125 }, { "clip_ratio": 0.0, "completion_length": 148.90625, "epoch": 0.9494097807757167, "grad_norm": 5.375862707812532, "kl": 0.3525390625, "learning_rate": 8.101180438448566e-07, "loss": 0.0004, "reward": 2.9660009145736694, "reward_std": 0.22842106223106384, "rewards/final_reward": 0.3887532703580729, "rewards/mask_iou_reward": 0.19437663517903644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9660007953643799, "rewards/thk_ans_format_reward": 1.0, "step": 563, "think_completion_length": 77.96875 }, { "clip_ratio": 0.0, "completion_length": 147.3125, "epoch": 0.9510961214165261, "grad_norm": 5.006303669029434, "kl": 0.685546875, "learning_rate": 8.097807757166948e-07, "loss": 0.0007, "reward": 2.836309552192688, "reward_std": 0.33161526918411255, "rewards/final_reward": 0.3801896632615437, "rewards/mask_iou_reward": 0.19009483163077184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8363094627857208, "rewards/thk_ans_format_reward": 1.0, "step": 564, "think_completion_length": 80.96875 }, { "clip_ratio": 0.0, "completion_length": 160.359375, "epoch": 0.9527824620573356, "grad_norm": 6.971235012770987, "kl": 0.369140625, "learning_rate": 8.094435075885329e-07, "loss": 0.0004, "reward": 3.0500903129577637, "reward_std": 0.21272655948996544, "rewards/final_reward": 0.5225397620726526, "rewards/mask_iou_reward": 0.2612698810363263, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0500901937484741, "rewards/thk_ans_format_reward": 1.0, "step": 565, "think_completion_length": 71.65625 }, { "clip_ratio": 0.0, "completion_length": 157.4375, "epoch": 0.954468802698145, "grad_norm": 4.573759651160915, "kl": 0.375, "learning_rate": 8.09106239460371e-07, "loss": 0.0004, "reward": 2.946573495864868, "reward_std": 0.24175241217017174, "rewards/final_reward": 1.4133046233131688, "rewards/mask_iou_reward": 0.7066523116565844, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9465733766555786, "rewards/thk_ans_format_reward": 1.0, "step": 566, "think_completion_length": 83.03125 }, { "clip_ratio": 0.0, "completion_length": 158.59375, "epoch": 0.9561551433389545, "grad_norm": 7.187812157503615, "kl": 0.3544921875, "learning_rate": 8.087689713322092e-07, "loss": 0.0004, "reward": 3.17309832572937, "reward_std": 0.38770583271980286, "rewards/final_reward": 1.101495376355855, "rewards/mask_iou_reward": 0.5507476881779275, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1730982661247253, "rewards/thk_ans_format_reward": 1.0, "step": 567, "think_completion_length": 70.0 }, { "clip_ratio": 0.0, "completion_length": 147.390625, "epoch": 0.9578414839797639, "grad_norm": 7.228857272066258, "kl": 0.3603515625, "learning_rate": 8.084317032040471e-07, "loss": 0.0004, "reward": 3.3016059398651123, "reward_std": 0.2917497009038925, "rewards/final_reward": 1.6633718802454966, "rewards/mask_iou_reward": 0.8316859401227483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3016058802604675, "rewards/thk_ans_format_reward": 1.0, "step": 568, "think_completion_length": 74.125 }, { "clip_ratio": 0.0, "completion_length": 163.96875, "epoch": 0.9595278246205734, "grad_norm": 5.464540876657938, "kl": 0.3779296875, "learning_rate": 8.080944350758852e-07, "loss": 0.0004, "reward": 2.973666191101074, "reward_std": 0.5053753107786179, "rewards/final_reward": 1.1511587511836887, "rewards/mask_iou_reward": 0.5755793755918444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9736661314964294, "rewards/thk_ans_format_reward": 1.0, "step": 569, "think_completion_length": 82.875 }, { "clip_ratio": 0.0, "completion_length": 141.984375, "epoch": 0.9612141652613828, "grad_norm": 7.821684462869653, "kl": 0.4404296875, "learning_rate": 8.077571669477234e-07, "loss": 0.0004, "reward": 3.028488874435425, "reward_std": 0.34643781185150146, "rewards/final_reward": 1.6624882837630195, "rewards/mask_iou_reward": 0.8312441418815097, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0284889936447144, "rewards/thk_ans_format_reward": 1.0, "step": 570, "think_completion_length": 67.125 }, { "clip_ratio": 0.0, "completion_length": 135.71875, "epoch": 0.9629005059021922, "grad_norm": 7.693091615022995, "kl": 0.375, "learning_rate": 8.074198988195615e-07, "loss": 0.0004, "reward": 2.7286545038223267, "reward_std": 0.30753113329410553, "rewards/final_reward": 1.0370801433569967, "rewards/mask_iou_reward": 0.5185400716784984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7286544442176819, "rewards/thk_ans_format_reward": 1.0, "step": 571, "think_completion_length": 66.53125 }, { "clip_ratio": 0.0, "completion_length": 136.78125, "epoch": 0.9645868465430016, "grad_norm": 9.500808036106628, "kl": 0.384765625, "learning_rate": 8.070826306913996e-07, "loss": 0.0004, "reward": 2.7350605726242065, "reward_std": 0.19486035406589508, "rewards/final_reward": 0.5022729043104245, "rewards/mask_iou_reward": 0.25113645215521224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7350606322288513, "rewards/thk_ans_format_reward": 1.0, "step": 572, "think_completion_length": 67.5 }, { "clip_ratio": 0.0, "completion_length": 140.625, "epoch": 0.9662731871838112, "grad_norm": 14.562869249290125, "kl": 0.380859375, "learning_rate": 8.067453625632378e-07, "loss": 0.0004, "reward": 3.1157146692276, "reward_std": 0.21578150242567062, "rewards/final_reward": 1.424389242246907, "rewards/mask_iou_reward": 0.7121946211234536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1313397586345673, "rewards/thk_ans_format_reward": 0.984375, "step": 573, "think_completion_length": 69.84375 }, { "clip_ratio": 0.0, "completion_length": 149.921875, "epoch": 0.9679595278246206, "grad_norm": 4.906234454731228, "kl": 0.396484375, "learning_rate": 8.064080944350759e-07, "loss": 0.0004, "reward": 2.532925605773926, "reward_std": 0.20672988891601562, "rewards/final_reward": 0.9013923587631087, "rewards/mask_iou_reward": 0.6365832389192185, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5329254902899265, "rewards/thk_ans_format_reward": 1.0, "step": 574, "think_completion_length": 66.53125 }, { "clip_ratio": 0.0, "completion_length": 195.46875, "epoch": 0.96964586846543, "grad_norm": 11.348233863362653, "kl": 0.453125, "learning_rate": 8.06070826306914e-07, "loss": 0.0005, "reward": 2.809985041618347, "reward_std": 0.24615808576345444, "rewards/final_reward": 1.39664637338225, "rewards/mask_iou_reward": 0.698323186691125, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8256100416183472, "rewards/thk_ans_format_reward": 1.0, "step": 575, "think_completion_length": 69.25 }, { "clip_ratio": 0.0, "completion_length": 139.09375, "epoch": 0.9713322091062394, "grad_norm": 5.294065865305967, "kl": 0.3916015625, "learning_rate": 8.057335581787521e-07, "loss": 0.0004, "reward": 3.430790901184082, "reward_std": 0.043369969353079796, "rewards/final_reward": 1.1329873898201583, "rewards/mask_iou_reward": 0.5664936949100792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.430790901184082, "rewards/thk_ans_format_reward": 1.0, "step": 576, "think_completion_length": 66.1875 }, { "clip_ratio": 0.0, "completion_length": 139.59375, "epoch": 0.9730185497470489, "grad_norm": 8.854015009866645, "kl": 0.375, "learning_rate": 8.053962900505901e-07, "loss": 0.0004, "reward": 3.2189637422561646, "reward_std": 0.43901485204696655, "rewards/final_reward": 1.0151315489185706, "rewards/mask_iou_reward": 0.5075657744592853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2189638018608093, "rewards/thk_ans_format_reward": 1.0, "step": 577, "think_completion_length": 68.8125 }, { "clip_ratio": 0.0, "completion_length": 140.546875, "epoch": 0.9747048903878583, "grad_norm": 4.261687082002958, "kl": 0.392578125, "learning_rate": 8.050590219224282e-07, "loss": 0.0004, "reward": 3.3709945678710938, "reward_std": 0.14033617079257965, "rewards/final_reward": 1.3439573249012957, "rewards/mask_iou_reward": 0.6719786624506479, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3709946870803833, "rewards/thk_ans_format_reward": 1.0, "step": 578, "think_completion_length": 66.34375 }, { "clip_ratio": 0.0, "completion_length": 139.109375, "epoch": 0.9763912310286678, "grad_norm": 4.447207228653514, "kl": 0.388671875, "learning_rate": 8.047217537942664e-07, "loss": 0.0004, "reward": 3.2281277179718018, "reward_std": 0.14848940074443817, "rewards/final_reward": 0.8529733127401289, "rewards/mask_iou_reward": 0.42648665637006444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2281277775764465, "rewards/thk_ans_format_reward": 1.0, "step": 579, "think_completion_length": 68.34375 }, { "clip_ratio": 0.0, "completion_length": 138.671875, "epoch": 0.9780775716694773, "grad_norm": 4.781897329353487, "kl": 0.392578125, "learning_rate": 8.043844856661045e-07, "loss": 0.0004, "reward": 3.032002568244934, "reward_std": 0.18674946948885918, "rewards/final_reward": 1.2685883990484217, "rewards/mask_iou_reward": 0.6342941995242108, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0320026278495789, "rewards/thk_ans_format_reward": 1.0, "step": 580, "think_completion_length": 62.65625 }, { "clip_ratio": 0.0, "completion_length": 186.578125, "epoch": 0.9797639123102867, "grad_norm": 5.322720152197539, "kl": 0.3583984375, "learning_rate": 8.040472175379426e-07, "loss": 0.0004, "reward": 3.2712482213974, "reward_std": 0.49656783044338226, "rewards/final_reward": 1.682521514658056, "rewards/mask_iou_reward": 0.841260757329028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2712482213974, "rewards/thk_ans_format_reward": 1.0, "step": 581, "think_completion_length": 67.90625 }, { "clip_ratio": 0.0, "completion_length": 149.59375, "epoch": 0.9814502529510961, "grad_norm": 12.090539903297808, "kl": 0.38671875, "learning_rate": 8.037099494097808e-07, "loss": 0.0004, "reward": 3.437628746032715, "reward_std": 0.28335001319646835, "rewards/final_reward": 1.5924873047351613, "rewards/mask_iou_reward": 0.7962436523675807, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4532538652420044, "rewards/thk_ans_format_reward": 1.0, "step": 582, "think_completion_length": 70.59375 }, { "clip_ratio": 0.0, "completion_length": 136.765625, "epoch": 0.9831365935919055, "grad_norm": 9.381927842657715, "kl": 0.35546875, "learning_rate": 8.033726812816189e-07, "loss": 0.0004, "reward": 3.397608757019043, "reward_std": 0.22774043679237366, "rewards/final_reward": 1.1310353021720032, "rewards/mask_iou_reward": 0.5655176510860016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3976088762283325, "rewards/thk_ans_format_reward": 1.0, "step": 583, "think_completion_length": 65.6875 }, { "clip_ratio": 0.0, "completion_length": 145.34375, "epoch": 0.984822934232715, "grad_norm": 5.605649395831915, "kl": 0.38671875, "learning_rate": 8.030354131534569e-07, "loss": 0.0004, "reward": 3.2333011627197266, "reward_std": 0.145121393725276, "rewards/final_reward": 1.088090215710324, "rewards/mask_iou_reward": 0.544045107855162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2489261627197266, "rewards/thk_ans_format_reward": 0.984375, "step": 584, "think_completion_length": 64.28125 }, { "clip_ratio": 0.0, "completion_length": 135.0625, "epoch": 0.9865092748735245, "grad_norm": 8.106226266357831, "kl": 0.3798828125, "learning_rate": 8.02698145025295e-07, "loss": 0.0004, "reward": 3.3262200355529785, "reward_std": 0.04663046449422836, "rewards/final_reward": 1.5821852953472084, "rewards/mask_iou_reward": 0.7910926476736042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3262198567390442, "rewards/thk_ans_format_reward": 1.0, "step": 585, "think_completion_length": 68.03125 }, { "clip_ratio": 0.0, "completion_length": 131.375, "epoch": 0.9881956155143339, "grad_norm": 12.60392616979475, "kl": 0.482421875, "learning_rate": 8.023608768971331e-07, "loss": 0.0005, "reward": 3.0508854389190674, "reward_std": 0.16749375313520432, "rewards/final_reward": 0.5494016521565961, "rewards/mask_iou_reward": 0.27470082607829804, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.050885260105133, "rewards/thk_ans_format_reward": 1.0, "step": 586, "think_completion_length": 60.71875 }, { "clip_ratio": 0.0, "completion_length": 145.0, "epoch": 0.9898819561551433, "grad_norm": 4.6943230545014885, "kl": 0.4091796875, "learning_rate": 8.020236087689713e-07, "loss": 0.0004, "reward": 3.140958547592163, "reward_std": 0.11254860181361437, "rewards/final_reward": 1.7400074032806145, "rewards/mask_iou_reward": 0.8700037016403073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1409586668014526, "rewards/thk_ans_format_reward": 1.0, "step": 587, "think_completion_length": 60.1875 }, { "clip_ratio": 0.0, "completion_length": 140.6875, "epoch": 0.9915682967959528, "grad_norm": 5.5450739698049, "kl": 0.3984375, "learning_rate": 8.016863406408094e-07, "loss": 0.0004, "reward": 2.947129249572754, "reward_std": 0.23405615240335464, "rewards/final_reward": 1.3299888408969076, "rewards/mask_iou_reward": 0.6649944204484538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9471292495727539, "rewards/thk_ans_format_reward": 1.0, "step": 588, "think_completion_length": 68.75 }, { "clip_ratio": 0.0, "completion_length": 136.046875, "epoch": 0.9932546374367622, "grad_norm": 5.584845044253345, "kl": 0.4248046875, "learning_rate": 8.013490725126475e-07, "loss": 0.0004, "reward": 2.913410186767578, "reward_std": 0.18985669524408877, "rewards/final_reward": 0.5123443019232624, "rewards/mask_iou_reward": 0.2561721509616312, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9134101867675781, "rewards/thk_ans_format_reward": 1.0, "step": 589, "think_completion_length": 64.25 }, { "clip_ratio": 0.0, "completion_length": 149.65625, "epoch": 0.9949409780775716, "grad_norm": 12.922202701045435, "kl": 0.435546875, "learning_rate": 8.010118043844857e-07, "loss": 0.0004, "reward": 2.93517804145813, "reward_std": 0.2607208490371704, "rewards/final_reward": 0.5130544114157121, "rewards/mask_iou_reward": 0.25652720570785603, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9351779818534851, "rewards/thk_ans_format_reward": 1.0, "step": 590, "think_completion_length": 69.5625 }, { "clip_ratio": 0.0, "completion_length": 134.171875, "epoch": 0.9966273187183811, "grad_norm": 4.585499227421089, "kl": 0.5078125, "learning_rate": 8.006745362563238e-07, "loss": 0.0005, "reward": 3.003074288368225, "reward_std": 0.21296508610248566, "rewards/final_reward": 1.2166292128694447, "rewards/mask_iou_reward": 0.6083146064347223, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.003074288368225, "rewards/thk_ans_format_reward": 1.0, "step": 591, "think_completion_length": 64.6875 }, { "clip_ratio": 0.0, "completion_length": 148.50000762939453, "epoch": 0.9983136593591906, "grad_norm": 4.028094910756302, "kl": 0.453125, "learning_rate": 8.003372681281619e-07, "loss": 0.0005, "reward": 2.4843724966049194, "reward_std": 0.13876148965209723, "rewards/final_reward": 0.5210704072103204, "rewards/mask_iou_reward": 0.2605352036051602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.48437248170375824, "rewards/thk_ans_format_reward": 1.0, "step": 592, "think_completion_length": 61.09375 }, { "clip_ratio": 0.0, "completion_length": 156.203125, "epoch": 1.0016863406408094, "grad_norm": 6.134430951701791, "kl": 0.37890625, "learning_rate": 8e-07, "loss": 0.0004, "reward": 3.165682554244995, "reward_std": 0.2925257980823517, "rewards/final_reward": 0.8199532475510345, "rewards/mask_iou_reward": 0.40997662377551725, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1813074350357056, "rewards/thk_ans_format_reward": 1.0, "step": 593, "think_completion_length": 58.9375 }, { "clip_ratio": 0.0, "completion_length": 135.171875, "epoch": 1.0033726812816188, "grad_norm": 5.720605963148865, "kl": 0.376953125, "learning_rate": 7.99662731871838e-07, "loss": 0.0004, "reward": 2.8060446977615356, "reward_std": 0.08986183628439903, "rewards/final_reward": 0.6783671900048, "rewards/mask_iou_reward": 0.3391835950024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.806044727563858, "rewards/thk_ans_format_reward": 1.0, "step": 594, "think_completion_length": 69.125 }, { "clip_ratio": 0.0, "completion_length": 129.59375, "epoch": 1.0050590219224282, "grad_norm": 5.326609065448382, "kl": 0.4091796875, "learning_rate": 7.993254637436761e-07, "loss": 0.0004, "reward": 3.6479886770248413, "reward_std": 0.04161073174327612, "rewards/final_reward": 1.7347050746666608, "rewards/mask_iou_reward": 0.8673525373333304, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6479886770248413, "rewards/thk_ans_format_reward": 1.0, "step": 595, "think_completion_length": 58.5625 }, { "clip_ratio": 0.0, "completion_length": 137.40625, "epoch": 1.0067453625632379, "grad_norm": 4.209252478789583, "kl": 0.49609375, "learning_rate": 7.989881956155143e-07, "loss": 0.0005, "reward": 3.5398935079574585, "reward_std": 0.20502007007598877, "rewards/final_reward": 1.4636818424143263, "rewards/mask_iou_reward": 0.7318409212071632, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.539893388748169, "rewards/thk_ans_format_reward": 1.0, "step": 596, "think_completion_length": 62.40625 }, { "clip_ratio": 0.0, "completion_length": 140.296875, "epoch": 1.0084317032040473, "grad_norm": 3.5878805024775158, "kl": 0.359375, "learning_rate": 7.986509274873524e-07, "loss": 0.0004, "reward": 3.013667583465576, "reward_std": 0.2156200110912323, "rewards/final_reward": 0.5815063242492908, "rewards/mask_iou_reward": 0.2907531621246454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0136675834655762, "rewards/thk_ans_format_reward": 1.0, "step": 597, "think_completion_length": 66.15625 }, { "clip_ratio": 0.0, "completion_length": 152.03125, "epoch": 1.0101180438448567, "grad_norm": 10.09366079770441, "kl": 0.400390625, "learning_rate": 7.983136593591905e-07, "loss": 0.0004, "reward": 3.603295087814331, "reward_std": 0.24045547097921371, "rewards/final_reward": 1.5487568914734149, "rewards/mask_iou_reward": 0.7743784457367074, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.603295087814331, "rewards/thk_ans_format_reward": 1.0, "step": 598, "think_completion_length": 67.0625 }, { "clip_ratio": 0.0, "completion_length": 153.390625, "epoch": 1.0118043844856661, "grad_norm": 27.303257749207962, "kl": 0.3798828125, "learning_rate": 7.979763912310287e-07, "loss": 0.0004, "reward": 3.058853507041931, "reward_std": 0.2468905746936798, "rewards/final_reward": 0.7482793830128771, "rewards/mask_iou_reward": 0.37413969150643855, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.058853566646576, "rewards/thk_ans_format_reward": 1.0, "step": 599, "think_completion_length": 63.25 }, { "clip_ratio": 0.0, "completion_length": 164.140625, "epoch": 1.0134907251264755, "grad_norm": 8.621046672564987, "kl": 0.388671875, "learning_rate": 7.976391231028668e-07, "loss": 0.0004, "reward": 2.5445148944854736, "reward_std": 0.23673780262470245, "rewards/final_reward": 0.2954952263858441, "rewards/mask_iou_reward": 0.14774761319292204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5445149838924408, "rewards/thk_ans_format_reward": 1.0, "step": 600, "think_completion_length": 62.84375 }, { "clip_ratio": 0.0, "completion_length": 150.6875, "epoch": 1.015177065767285, "grad_norm": 11.444735510612778, "kl": 0.365234375, "learning_rate": 7.973018549747048e-07, "loss": 0.0004, "reward": 3.351117491722107, "reward_std": 0.16388334333896637, "rewards/final_reward": 1.430642644523778, "rewards/mask_iou_reward": 0.715321322261889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.351117491722107, "rewards/thk_ans_format_reward": 1.0, "step": 601, "think_completion_length": 60.6875 }, { "clip_ratio": 0.0, "completion_length": 141.546875, "epoch": 1.0168634064080944, "grad_norm": 16.50112903301435, "kl": 0.3955078125, "learning_rate": 7.96964586846543e-07, "loss": 0.0004, "reward": 3.4486085176467896, "reward_std": 0.07140736281871796, "rewards/final_reward": 1.781127874725175, "rewards/mask_iou_reward": 0.8905639373625875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4486083388328552, "rewards/thk_ans_format_reward": 1.0, "step": 602, "think_completion_length": 59.90625 }, { "clip_ratio": 0.0, "completion_length": 140.640625, "epoch": 1.0185497470489038, "grad_norm": 3.386400756178599, "kl": 0.41015625, "learning_rate": 7.96627318718381e-07, "loss": 0.0004, "reward": 3.1761107444763184, "reward_std": 0.19732992816716433, "rewards/final_reward": 0.6813215000785604, "rewards/mask_iou_reward": 0.3406607500392802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.176110714673996, "rewards/thk_ans_format_reward": 1.0, "step": 603, "think_completion_length": 58.5625 }, { "clip_ratio": 0.0, "completion_length": 132.984375, "epoch": 1.0202360876897134, "grad_norm": 6.050376141836877, "kl": 0.40625, "learning_rate": 7.962900505902191e-07, "loss": 0.0004, "reward": 3.3012163639068604, "reward_std": 0.2960309898480773, "rewards/final_reward": 1.299787516927388, "rewards/mask_iou_reward": 0.649893758463694, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3012162744998932, "rewards/thk_ans_format_reward": 1.0, "step": 604, "think_completion_length": 59.6875 }, { "clip_ratio": 0.0, "completion_length": 141.40625, "epoch": 1.0219224283305228, "grad_norm": 8.792295259990127, "kl": 0.439453125, "learning_rate": 7.959527824620573e-07, "loss": 0.0004, "reward": 3.051405191421509, "reward_std": 0.2848479002714157, "rewards/final_reward": 1.513040004001537, "rewards/mask_iou_reward": 0.7565200020007685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0514051914215088, "rewards/thk_ans_format_reward": 1.0, "step": 605, "think_completion_length": 54.3125 }, { "clip_ratio": 0.0, "completion_length": 134.34375, "epoch": 1.0236087689713322, "grad_norm": 9.924707938769188, "kl": 1.1298828125, "learning_rate": 7.956155143338954e-07, "loss": 0.0011, "reward": 3.635921359062195, "reward_std": 0.23812589421868324, "rewards/final_reward": 1.5734499780355082, "rewards/mask_iou_reward": 0.7867249890177541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.63592129945755, "rewards/thk_ans_format_reward": 1.0, "step": 606, "think_completion_length": 63.84375 }, { "clip_ratio": 0.0, "completion_length": 135.34375, "epoch": 1.0252951096121417, "grad_norm": 5.650344995506986, "kl": 0.400390625, "learning_rate": 7.952782462057335e-07, "loss": 0.0004, "reward": 2.9694302082061768, "reward_std": 0.4676559269428253, "rewards/final_reward": 1.1528093681017233, "rewards/mask_iou_reward": 0.5764046840508616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9694302976131439, "rewards/thk_ans_format_reward": 1.0, "step": 607, "think_completion_length": 58.5625 }, { "clip_ratio": 0.0, "completion_length": 133.375, "epoch": 1.026981450252951, "grad_norm": 43.31806523897635, "kl": 0.40234375, "learning_rate": 7.949409780775717e-07, "loss": 0.0005, "reward": 3.5301413536071777, "reward_std": 0.11195811629295349, "rewards/final_reward": 1.2349649014730204, "rewards/mask_iou_reward": 0.6174824507365102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5301413536071777, "rewards/thk_ans_format_reward": 1.0, "step": 608, "think_completion_length": 60.46875 }, { "clip_ratio": 0.0, "completion_length": 131.921875, "epoch": 1.0286677908937605, "grad_norm": 4.963350314889085, "kl": 0.365234375, "learning_rate": 7.946037099494097e-07, "loss": 0.0004, "reward": 3.425765633583069, "reward_std": 0.1056961864233017, "rewards/final_reward": 1.2456830820121185, "rewards/mask_iou_reward": 0.6228415410060593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4257656931877136, "rewards/thk_ans_format_reward": 1.0, "step": 609, "think_completion_length": 64.34375 }, { "clip_ratio": 0.0, "completion_length": 136.8125, "epoch": 1.03035413153457, "grad_norm": 6.993123062729114, "kl": 0.39453125, "learning_rate": 7.942664418212478e-07, "loss": 0.0004, "reward": 2.9795360565185547, "reward_std": 0.16955439560115337, "rewards/final_reward": 0.9026098214107283, "rewards/mask_iou_reward": 0.45130491070536416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9795358777046204, "rewards/thk_ans_format_reward": 1.0, "step": 610, "think_completion_length": 69.9375 }, { "clip_ratio": 0.0, "completion_length": 132.453125, "epoch": 1.0320404721753795, "grad_norm": 5.435356630552201, "kl": 0.4072265625, "learning_rate": 7.93929173693086e-07, "loss": 0.0004, "reward": 3.634620428085327, "reward_std": 0.12156452983617783, "rewards/final_reward": 1.7145997847390988, "rewards/mask_iou_reward": 0.8572998923695494, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6346204280853271, "rewards/thk_ans_format_reward": 1.0, "step": 611, "think_completion_length": 62.375 }, { "clip_ratio": 0.0, "completion_length": 132.046875, "epoch": 1.033726812816189, "grad_norm": 5.7166593319996695, "kl": 0.376953125, "learning_rate": 7.93591905564924e-07, "loss": 0.0004, "reward": 3.13163959980011, "reward_std": 0.40263403952121735, "rewards/final_reward": 1.020172167459069, "rewards/mask_iou_reward": 0.5100860837295345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1316396296024323, "rewards/thk_ans_format_reward": 1.0, "step": 612, "think_completion_length": 64.4375 }, { "clip_ratio": 0.0, "completion_length": 132.921875, "epoch": 1.0354131534569984, "grad_norm": 8.067962248791533, "kl": 0.3857421875, "learning_rate": 7.932546374367622e-07, "loss": 0.0004, "reward": 3.3966641426086426, "reward_std": 0.056119462475180626, "rewards/final_reward": 1.7494821178904116, "rewards/mask_iou_reward": 0.8747410589452058, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3966641426086426, "rewards/thk_ans_format_reward": 1.0, "step": 613, "think_completion_length": 59.40625 }, { "clip_ratio": 0.0, "completion_length": 129.34375, "epoch": 1.0370994940978078, "grad_norm": 7.981542131433182, "kl": 0.42578125, "learning_rate": 7.929173693086003e-07, "loss": 0.0004, "reward": 3.305042862892151, "reward_std": 0.09254418313503265, "rewards/final_reward": 1.3203188567393185, "rewards/mask_iou_reward": 0.6601594283696592, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3050429224967957, "rewards/thk_ans_format_reward": 1.0, "step": 614, "think_completion_length": 58.78125 }, { "clip_ratio": 0.0, "completion_length": 139.359375, "epoch": 1.0387858347386172, "grad_norm": 30.331721779804514, "kl": 0.37890625, "learning_rate": 7.925801011804384e-07, "loss": 0.0005, "reward": 3.3478095531463623, "reward_std": 0.12014642171561718, "rewards/final_reward": 1.7833511407636402, "rewards/mask_iou_reward": 0.8916755703818201, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3478095531463623, "rewards/thk_ans_format_reward": 1.0, "step": 615, "think_completion_length": 65.65625 }, { "clip_ratio": 0.0, "completion_length": 149.453125, "epoch": 1.0404721753794266, "grad_norm": 7.313904503197525, "kl": 0.3759765625, "learning_rate": 7.922428330522766e-07, "loss": 0.0004, "reward": 3.3215017318725586, "reward_std": 0.3130381852388382, "rewards/final_reward": 0.9915974087722843, "rewards/mask_iou_reward": 0.49579870438614215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3215016722679138, "rewards/thk_ans_format_reward": 1.0, "step": 616, "think_completion_length": 62.34375 }, { "clip_ratio": 0.0, "completion_length": 125.671875, "epoch": 1.042158516020236, "grad_norm": 3.2835782882643687, "kl": 0.3955078125, "learning_rate": 7.919055649241147e-07, "loss": 0.0004, "reward": 3.607258439064026, "reward_std": 0.09734362363815308, "rewards/final_reward": 1.5118354327932084, "rewards/mask_iou_reward": 0.7559177163966042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6072584390640259, "rewards/thk_ans_format_reward": 1.0, "step": 617, "think_completion_length": 53.4375 }, { "clip_ratio": 0.0, "completion_length": 130.453125, "epoch": 1.0438448566610454, "grad_norm": 12.442324301753523, "kl": 0.4423828125, "learning_rate": 7.915682967959527e-07, "loss": 0.0004, "reward": 3.0813547372817993, "reward_std": 0.25647109746932983, "rewards/final_reward": 0.8053913152295082, "rewards/mask_iou_reward": 0.4026956576147541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0813546776771545, "rewards/thk_ans_format_reward": 1.0, "step": 618, "think_completion_length": 55.53125 }, { "clip_ratio": 0.0, "completion_length": 135.21875, "epoch": 1.045531197301855, "grad_norm": 4.388943716983332, "kl": 0.44140625, "learning_rate": 7.912310286677909e-07, "loss": 0.0004, "reward": 3.1991811990737915, "reward_std": 0.06475630914792418, "rewards/final_reward": 1.6401812516112466, "rewards/mask_iou_reward": 0.8200906258056233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1991811692714691, "rewards/thk_ans_format_reward": 1.0, "step": 619, "think_completion_length": 55.4375 }, { "clip_ratio": 0.0, "completion_length": 126.640625, "epoch": 1.0472175379426645, "grad_norm": 6.518516562698951, "kl": 0.4638671875, "learning_rate": 7.90893760539629e-07, "loss": 0.0005, "reward": 3.4357359409332275, "reward_std": 0.19499383866786957, "rewards/final_reward": 0.9589320952628577, "rewards/mask_iou_reward": 0.47946604763142886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4357359409332275, "rewards/thk_ans_format_reward": 1.0, "step": 620, "think_completion_length": 59.09375 }, { "clip_ratio": 0.0, "completion_length": 131.78125, "epoch": 1.048903878583474, "grad_norm": 3.5479767787749883, "kl": 0.37109375, "learning_rate": 7.90556492411467e-07, "loss": 0.0004, "reward": 3.5422295331954956, "reward_std": 0.20139742642641068, "rewards/final_reward": 1.6928670840316493, "rewards/mask_iou_reward": 0.8464335420158247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.54222971200943, "rewards/thk_ans_format_reward": 1.0, "step": 621, "think_completion_length": 60.59375 }, { "clip_ratio": 0.0, "completion_length": 135.171875, "epoch": 1.0505902192242833, "grad_norm": 33.272162033127636, "kl": 0.3916015625, "learning_rate": 7.902192242833052e-07, "loss": 0.0004, "reward": 2.83156681060791, "reward_std": 0.21725196577608585, "rewards/final_reward": 0.8005620578474413, "rewards/mask_iou_reward": 0.40028102892372064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8315666913986206, "rewards/thk_ans_format_reward": 1.0, "step": 622, "think_completion_length": 60.6875 }, { "clip_ratio": 0.0, "completion_length": 136.578125, "epoch": 1.0522765598650927, "grad_norm": 8.5697470890475, "kl": 0.505859375, "learning_rate": 7.898819561551433e-07, "loss": 0.0005, "reward": 3.144253969192505, "reward_std": 0.17730970680713654, "rewards/final_reward": 1.4364145312940275, "rewards/mask_iou_reward": 0.7182072656470138, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.14425390958786, "rewards/thk_ans_format_reward": 1.0, "step": 623, "think_completion_length": 56.96875 }, { "clip_ratio": 0.0, "completion_length": 140.84375, "epoch": 1.0539629005059021, "grad_norm": 13.854537021856972, "kl": 0.501953125, "learning_rate": 7.895446880269814e-07, "loss": 0.0005, "reward": 2.9286797046661377, "reward_std": 0.2590184882283211, "rewards/final_reward": 0.7867915929348831, "rewards/mask_iou_reward": 0.39339579646744155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9286799728870392, "rewards/thk_ans_format_reward": 1.0, "step": 624, "think_completion_length": 54.09375 }, { "clip_ratio": 0.0, "completion_length": 136.34375, "epoch": 1.0556492411467115, "grad_norm": 8.146735804112318, "kl": 0.38671875, "learning_rate": 7.892074198988196e-07, "loss": 0.0004, "reward": 3.0838944911956787, "reward_std": 0.1183428168296814, "rewards/final_reward": 1.2904961257823027, "rewards/mask_iou_reward": 0.6452480628911513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0838944911956787, "rewards/thk_ans_format_reward": 1.0, "step": 625, "think_completion_length": 55.0 }, { "clip_ratio": 0.0, "completion_length": 135.234375, "epoch": 1.0573355817875212, "grad_norm": 20.97851972202258, "kl": 0.3642578125, "learning_rate": 7.888701517706576e-07, "loss": 0.0004, "reward": 3.509265661239624, "reward_std": 0.08183963038027287, "rewards/final_reward": 1.1006403752958818, "rewards/mask_iou_reward": 0.5503201876479409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.509265661239624, "rewards/thk_ans_format_reward": 1.0, "step": 626, "think_completion_length": 57.6875 }, { "clip_ratio": 0.0, "completion_length": 130.375, "epoch": 1.0590219224283306, "grad_norm": 7.9849582432385855, "kl": 0.4345703125, "learning_rate": 7.885328836424957e-07, "loss": 0.0004, "reward": 3.200868010520935, "reward_std": 0.2000262811779976, "rewards/final_reward": 1.1624054937062296, "rewards/mask_iou_reward": 0.5812027468531148, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.216493010520935, "rewards/thk_ans_format_reward": 0.984375, "step": 627, "think_completion_length": 60.125 }, { "clip_ratio": 0.0, "completion_length": 128.328125, "epoch": 1.06070826306914, "grad_norm": 8.670323596103566, "kl": 0.392578125, "learning_rate": 7.881956155143339e-07, "loss": 0.0004, "reward": 3.0339386463165283, "reward_std": 0.07193895429372787, "rewards/final_reward": 0.7449650734008929, "rewards/mask_iou_reward": 0.37248253670044645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0339386463165283, "rewards/thk_ans_format_reward": 1.0, "step": 628, "think_completion_length": 53.4375 }, { "clip_ratio": 0.0, "completion_length": 126.671875, "epoch": 1.0623946037099494, "grad_norm": 6.489276755856646, "kl": 0.41015625, "learning_rate": 7.87858347386172e-07, "loss": 0.0004, "reward": 3.4266610145568848, "reward_std": 0.335693646222353, "rewards/final_reward": 1.6146912092898873, "rewards/mask_iou_reward": 0.8073456046449436, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.42666095495224, "rewards/thk_ans_format_reward": 1.0, "step": 629, "think_completion_length": 54.8125 }, { "clip_ratio": 0.0, "completion_length": 157.71875, "epoch": 1.0640809443507588, "grad_norm": 16.97609842909034, "kl": 0.404296875, "learning_rate": 7.8752107925801e-07, "loss": 0.0004, "reward": 3.035442352294922, "reward_std": 0.39801885560154915, "rewards/final_reward": 1.6349356163559836, "rewards/mask_iou_reward": 0.8174678081779918, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.0979422330856323, "rewards/thk_ans_format_reward": 0.96875, "step": 630, "think_completion_length": 52.1875 }, { "clip_ratio": 0.0, "completion_length": 143.578125, "epoch": 1.0657672849915683, "grad_norm": 15.181511428300269, "kl": 0.580078125, "learning_rate": 7.871838111298482e-07, "loss": 0.0006, "reward": 3.3145382404327393, "reward_std": 0.1567831113934517, "rewards/final_reward": 0.7586362291775279, "rewards/mask_iou_reward": 0.37931811458876397, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3145381212234497, "rewards/thk_ans_format_reward": 1.0, "step": 631, "think_completion_length": 51.09375 }, { "clip_ratio": 0.0, "completion_length": 129.453125, "epoch": 1.0674536256323777, "grad_norm": 9.106384318124793, "kl": 0.380859375, "learning_rate": 7.868465430016863e-07, "loss": 0.0004, "reward": 3.314599871635437, "reward_std": 0.1469927802681923, "rewards/final_reward": 1.1458556851733492, "rewards/mask_iou_reward": 0.5729278425866746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3145997822284698, "rewards/thk_ans_format_reward": 1.0, "step": 632, "think_completion_length": 54.625 }, { "clip_ratio": 0.0, "completion_length": 126.375, "epoch": 1.069139966273187, "grad_norm": 7.79880255821729, "kl": 0.9228515625, "learning_rate": 7.865092748735244e-07, "loss": 0.0009, "reward": 3.261892080307007, "reward_std": 0.11794530600309372, "rewards/final_reward": 1.251106499385277, "rewards/mask_iou_reward": 0.6255532496926385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2618920803070068, "rewards/thk_ans_format_reward": 1.0, "step": 633, "think_completion_length": 51.28125 }, { "clip_ratio": 0.0, "completion_length": 121.234375, "epoch": 1.0708263069139967, "grad_norm": 4.452227059679726, "kl": 0.48828125, "learning_rate": 7.861720067453625e-07, "loss": 0.0005, "reward": 3.0262283086776733, "reward_std": 0.09251783415675163, "rewards/final_reward": 1.2590802333169506, "rewards/mask_iou_reward": 0.6295401166584753, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0262282192707062, "rewards/thk_ans_format_reward": 1.0, "step": 634, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 133.0625, "epoch": 1.0725126475548061, "grad_norm": 39.93271843995892, "kl": 0.552734375, "learning_rate": 7.858347386172006e-07, "loss": 0.0006, "reward": 3.4057939052581787, "reward_std": 0.12188607268035412, "rewards/final_reward": 1.7272153940200017, "rewards/mask_iou_reward": 0.8636076970100008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4057939052581787, "rewards/thk_ans_format_reward": 1.0, "step": 635, "think_completion_length": 50.09375 }, { "clip_ratio": 0.0, "completion_length": 141.78125, "epoch": 1.0741989881956155, "grad_norm": 39.57929332449854, "kl": 0.6142578125, "learning_rate": 7.854974704890388e-07, "loss": 0.0006, "reward": 3.00014066696167, "reward_std": 0.4901411384344101, "rewards/final_reward": 1.0535258352398766, "rewards/mask_iou_reward": 0.5267629176199383, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0157656520605087, "rewards/thk_ans_format_reward": 1.0, "step": 636, "think_completion_length": 50.09375 }, { "clip_ratio": 0.0, "completion_length": 125.453125, "epoch": 1.075885328836425, "grad_norm": 3.306093363331697, "kl": 0.384765625, "learning_rate": 7.851602023608769e-07, "loss": 0.0004, "reward": 2.886072278022766, "reward_std": 0.10245025204494596, "rewards/final_reward": 1.2185875798915173, "rewards/mask_iou_reward": 0.6092937899457587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8860722482204437, "rewards/thk_ans_format_reward": 1.0, "step": 637, "think_completion_length": 49.78125 }, { "clip_ratio": 0.0, "completion_length": 133.359375, "epoch": 1.0775716694772344, "grad_norm": 279.3506833741868, "kl": 0.427734375, "learning_rate": 7.84822934232715e-07, "loss": 0.0004, "reward": 3.6662213802337646, "reward_std": 0.19164992403239012, "rewards/final_reward": 1.7271686355598388, "rewards/mask_iou_reward": 0.8635843177799194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6662213802337646, "rewards/thk_ans_format_reward": 1.0, "step": 638, "think_completion_length": 45.96875 }, { "clip_ratio": 0.0, "completion_length": 134.65625, "epoch": 1.0792580101180438, "grad_norm": 6.750675311209432, "kl": 0.408203125, "learning_rate": 7.844856661045531e-07, "loss": 0.0004, "reward": 3.471511483192444, "reward_std": 0.27441119961440563, "rewards/final_reward": 1.466517628389615, "rewards/mask_iou_reward": 0.7332588141948075, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4871366024017334, "rewards/thk_ans_format_reward": 1.0, "step": 639, "think_completion_length": 50.375 }, { "clip_ratio": 0.0, "completion_length": 117.53125, "epoch": 1.0809443507588532, "grad_norm": 21.646709986864188, "kl": 0.4453125, "learning_rate": 7.841483979763912e-07, "loss": 0.0004, "reward": 2.989328145980835, "reward_std": 0.09891379065811634, "rewards/final_reward": 1.0614732894732377, "rewards/mask_iou_reward": 0.5307366447366189, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9893282353878021, "rewards/thk_ans_format_reward": 1.0, "step": 640, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 118.421875, "epoch": 1.0826306913996628, "grad_norm": 5.5497035358072635, "kl": 0.451171875, "learning_rate": 7.838111298482293e-07, "loss": 0.0005, "reward": 3.1036102771759033, "reward_std": 0.06228892970830202, "rewards/final_reward": 0.4459223647664925, "rewards/mask_iou_reward": 0.22296118238324625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1036102771759033, "rewards/thk_ans_format_reward": 1.0, "step": 641, "think_completion_length": 45.34375 }, { "clip_ratio": 0.0, "completion_length": 123.8125, "epoch": 1.0843170320404723, "grad_norm": 61.0294153640609, "kl": 0.431640625, "learning_rate": 7.834738617200675e-07, "loss": 0.0004, "reward": 2.8523221015930176, "reward_std": 0.3196341544389725, "rewards/final_reward": 0.7710843417183383, "rewards/mask_iou_reward": 0.38554217085916914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8523220717906952, "rewards/thk_ans_format_reward": 1.0, "step": 642, "think_completion_length": 46.9375 }, { "clip_ratio": 0.0, "completion_length": 112.6875, "epoch": 1.0860033726812817, "grad_norm": 35.921443486527984, "kl": 0.4814453125, "learning_rate": 7.831365935919055e-07, "loss": 0.0005, "reward": 3.1496444940567017, "reward_std": 0.19438892230391502, "rewards/final_reward": 1.4102102276977724, "rewards/mask_iou_reward": 0.7051051138488862, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.149644523859024, "rewards/thk_ans_format_reward": 1.0, "step": 643, "think_completion_length": 44.0625 }, { "clip_ratio": 0.0, "completion_length": 118.984375, "epoch": 1.087689713322091, "grad_norm": 4.382258185474288, "kl": 0.44921875, "learning_rate": 7.827993254637436e-07, "loss": 0.0004, "reward": 3.082374095916748, "reward_std": 0.3743949681520462, "rewards/final_reward": 1.017262522169081, "rewards/mask_iou_reward": 0.5086312610845405, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.082374095916748, "rewards/thk_ans_format_reward": 1.0, "step": 644, "think_completion_length": 45.21875 }, { "clip_ratio": 0.0, "completion_length": 116.515625, "epoch": 1.0893760539629005, "grad_norm": 5.795620025146997, "kl": 0.458984375, "learning_rate": 7.824620573355818e-07, "loss": 0.0005, "reward": 3.644650101661682, "reward_std": 0.0579620311036706, "rewards/final_reward": 1.3550629451891312, "rewards/mask_iou_reward": 0.6775314725945656, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6446500420570374, "rewards/thk_ans_format_reward": 1.0, "step": 645, "think_completion_length": 48.125 }, { "clip_ratio": 0.0, "completion_length": 116.71875, "epoch": 1.09106239460371, "grad_norm": 5.570465496238775, "kl": 0.41796875, "learning_rate": 7.821247892074199e-07, "loss": 0.0004, "reward": 3.243638515472412, "reward_std": 0.08751763962209225, "rewards/final_reward": 0.7646918304668557, "rewards/mask_iou_reward": 0.38234591523342787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.243638515472412, "rewards/thk_ans_format_reward": 1.0, "step": 646, "think_completion_length": 44.8125 }, { "clip_ratio": 0.0, "completion_length": 122.203125, "epoch": 1.0927487352445193, "grad_norm": 6.600286707253367, "kl": 0.4091796875, "learning_rate": 7.81787521079258e-07, "loss": 0.0004, "reward": 3.062265157699585, "reward_std": 0.1764051355421543, "rewards/final_reward": 0.7824303531778339, "rewards/mask_iou_reward": 0.39121517658891697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0622652620077133, "rewards/thk_ans_format_reward": 1.0, "step": 647, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 123.21875, "epoch": 1.0944350758853287, "grad_norm": 4.030924908993303, "kl": 0.6171875, "learning_rate": 7.814502529510961e-07, "loss": 0.0006, "reward": 3.7860748767852783, "reward_std": 0.16600636392831802, "rewards/final_reward": 1.9111873242786737, "rewards/mask_iou_reward": 0.9555936621393368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7860747575759888, "rewards/thk_ans_format_reward": 1.0, "step": 648, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 113.15625, "epoch": 1.0961214165261384, "grad_norm": 4.2376043097515055, "kl": 0.7412109375, "learning_rate": 7.811129848229342e-07, "loss": 0.0007, "reward": 3.2226240634918213, "reward_std": 0.3179095759987831, "rewards/final_reward": 0.7279156450622776, "rewards/mask_iou_reward": 0.3639578225311388, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2382490634918213, "rewards/thk_ans_format_reward": 1.0, "step": 649, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 110.4375, "epoch": 1.0978077571669478, "grad_norm": 5.523908512555914, "kl": 0.4794921875, "learning_rate": 7.807757166947723e-07, "loss": 0.0005, "reward": 3.006617307662964, "reward_std": 0.14593719691038132, "rewards/final_reward": 1.5455323930270022, "rewards/mask_iou_reward": 0.7727661965135011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0066173672676086, "rewards/thk_ans_format_reward": 1.0, "step": 650, "think_completion_length": 43.03125 }, { "clip_ratio": 0.0, "completion_length": 118.1875, "epoch": 1.0994940978077572, "grad_norm": 4.917758171979661, "kl": 0.4541015625, "learning_rate": 7.804384485666104e-07, "loss": 0.0005, "reward": 2.8413726091384888, "reward_std": 0.298868240788579, "rewards/final_reward": 1.5816240118600156, "rewards/mask_iou_reward": 0.7908120059300078, "rewards/sam_format_reward": 0.890625, "rewards/sam_reward_func_ultra": 0.9507474899291992, "rewards/thk_ans_format_reward": 1.0, "step": 651, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 118.4375, "epoch": 1.1011804384485666, "grad_norm": 8.512332945482656, "kl": 0.4638671875, "learning_rate": 7.801011804384485e-07, "loss": 0.0005, "reward": 2.937049388885498, "reward_std": 0.3492274060845375, "rewards/final_reward": 1.1078227273727301, "rewards/mask_iou_reward": 0.5539113636863651, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.9682995975017548, "rewards/thk_ans_format_reward": 1.0, "step": 652, "think_completion_length": 48.90625 }, { "clip_ratio": 0.0, "completion_length": 120.34375, "epoch": 1.102866779089376, "grad_norm": 5.278917106886176, "kl": 0.4736328125, "learning_rate": 7.797639123102866e-07, "loss": 0.0005, "reward": 3.4667773246765137, "reward_std": 0.2953631021082401, "rewards/final_reward": 1.650441733990713, "rewards/mask_iou_reward": 0.8252208669953565, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4667773246765137, "rewards/thk_ans_format_reward": 1.0, "step": 653, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 112.734375, "epoch": 1.1045531197301854, "grad_norm": 12.133570183221005, "kl": 0.4599609375, "learning_rate": 7.794266441821248e-07, "loss": 0.0005, "reward": 3.3131359815597534, "reward_std": 0.141848836094141, "rewards/final_reward": 1.1049517799093656, "rewards/mask_iou_reward": 0.5524758899546828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3131359219551086, "rewards/thk_ans_format_reward": 1.0, "step": 654, "think_completion_length": 45.5625 }, { "clip_ratio": 0.0, "completion_length": 125.75, "epoch": 1.1062394603709949, "grad_norm": 6.928059153280524, "kl": 0.455078125, "learning_rate": 7.790893760539629e-07, "loss": 0.0005, "reward": 3.330763339996338, "reward_std": 0.46064063906669617, "rewards/final_reward": 1.2615539547932235, "rewards/mask_iou_reward": 0.6307769773966118, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3620134592056274, "rewards/thk_ans_format_reward": 1.0, "step": 655, "think_completion_length": 41.25 }, { "clip_ratio": 0.0, "completion_length": 114.625, "epoch": 1.1079258010118043, "grad_norm": 4.186907452762862, "kl": 0.4306640625, "learning_rate": 7.78752107925801e-07, "loss": 0.0004, "reward": 3.546625852584839, "reward_std": 0.20074513833969831, "rewards/final_reward": 1.3358930150040598, "rewards/mask_iou_reward": 0.6679465075020299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5466259121894836, "rewards/thk_ans_format_reward": 1.0, "step": 656, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 113.28125, "epoch": 1.109612141652614, "grad_norm": 6.109888489679801, "kl": 0.421875, "learning_rate": 7.784148397976391e-07, "loss": 0.0004, "reward": 3.2946518659591675, "reward_std": 0.34683462977409363, "rewards/final_reward": 0.8698238201682837, "rewards/mask_iou_reward": 0.4349119100841419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2946519255638123, "rewards/thk_ans_format_reward": 1.0, "step": 657, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 118.671875, "epoch": 1.1112984822934233, "grad_norm": 4.5710675246879156, "kl": 0.48046875, "learning_rate": 7.780775716694772e-07, "loss": 0.0005, "reward": 3.563894748687744, "reward_std": 0.2252323478460312, "rewards/final_reward": 1.6749597198477488, "rewards/mask_iou_reward": 0.8374798599238744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5638948678970337, "rewards/thk_ans_format_reward": 1.0, "step": 658, "think_completion_length": 46.75 }, { "clip_ratio": 0.0, "completion_length": 138.5, "epoch": 1.1129848229342327, "grad_norm": 10.866443084152671, "kl": 0.4345703125, "learning_rate": 7.777403035413152e-07, "loss": 0.0004, "reward": 2.81168270111084, "reward_std": 0.17690950445830822, "rewards/final_reward": 0.8872991301042769, "rewards/mask_iou_reward": 0.44364956505213843, "rewards/sam_format_reward": 0.796875, "rewards/sam_reward_func_ultra": 1.0148076713085175, "rewards/thk_ans_format_reward": 1.0, "step": 659, "think_completion_length": 44.25 }, { "clip_ratio": 0.0, "completion_length": 111.71875, "epoch": 1.1146711635750421, "grad_norm": 22.99738676784577, "kl": 0.4794921875, "learning_rate": 7.774030354131534e-07, "loss": 0.0005, "reward": 3.2107110023498535, "reward_std": 0.24932213127613068, "rewards/final_reward": 1.4678053004170692, "rewards/mask_iou_reward": 0.7339026502085346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2107109427452087, "rewards/thk_ans_format_reward": 1.0, "step": 660, "think_completion_length": 41.71875 }, { "clip_ratio": 0.0, "completion_length": 192.15625, "epoch": 1.1163575042158516, "grad_norm": 61.78637021385139, "kl": 0.4072265625, "learning_rate": 7.770657672849915e-07, "loss": 0.0005, "reward": 3.1127153635025024, "reward_std": 0.21405567973852158, "rewards/final_reward": 1.4737091512600995, "rewards/mask_iou_reward": 0.7368545756300497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.11271533370018, "rewards/thk_ans_format_reward": 1.0, "step": 661, "think_completion_length": 40.65625 }, { "clip_ratio": 0.0, "completion_length": 118.125, "epoch": 1.118043844856661, "grad_norm": 6.807626678475442, "kl": 0.525390625, "learning_rate": 7.767284991568297e-07, "loss": 0.0005, "reward": 3.3367409706115723, "reward_std": 0.23176462203264236, "rewards/final_reward": 1.0719849956783123, "rewards/mask_iou_reward": 0.5359924978391561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3367410898208618, "rewards/thk_ans_format_reward": 1.0, "step": 662, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 120.84375, "epoch": 1.1197301854974704, "grad_norm": 10.173401210446599, "kl": 0.455078125, "learning_rate": 7.763912310286678e-07, "loss": 0.0005, "reward": 2.986254930496216, "reward_std": 0.34364715963602066, "rewards/final_reward": 1.1272437953036465, "rewards/mask_iou_reward": 0.5636218976518232, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9862548112869263, "rewards/thk_ans_format_reward": 1.0, "step": 663, "think_completion_length": 42.0625 }, { "clip_ratio": 0.0, "completion_length": 112.265625, "epoch": 1.12141652613828, "grad_norm": 8.303520314295719, "kl": 0.5380859375, "learning_rate": 7.760539629005059e-07, "loss": 0.0005, "reward": 2.907498359680176, "reward_std": 0.31269126385450363, "rewards/final_reward": 0.7283450222985134, "rewards/mask_iou_reward": 0.3641725111492567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9074984192848206, "rewards/thk_ans_format_reward": 1.0, "step": 664, "think_completion_length": 44.6875 }, { "clip_ratio": 0.0, "completion_length": 135.1875, "epoch": 1.1231028667790894, "grad_norm": 6.347832378595188, "kl": 0.421875, "learning_rate": 7.757166947723441e-07, "loss": 0.0004, "reward": 2.9704482555389404, "reward_std": 0.13693349808454514, "rewards/final_reward": 0.6053038205227258, "rewards/mask_iou_reward": 0.3026519102613629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9704483151435852, "rewards/thk_ans_format_reward": 1.0, "step": 665, "think_completion_length": 41.5625 }, { "clip_ratio": 0.0, "completion_length": 125.828125, "epoch": 1.1247892074198989, "grad_norm": 8.262589575211235, "kl": 0.423828125, "learning_rate": 7.753794266441821e-07, "loss": 0.0004, "reward": 3.4435518980026245, "reward_std": 0.18118244968354702, "rewards/final_reward": 1.1758351428381058, "rewards/mask_iou_reward": 0.5879175714190529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4435516595840454, "rewards/thk_ans_format_reward": 1.0, "step": 666, "think_completion_length": 44.8125 }, { "clip_ratio": 0.0, "completion_length": 113.3125, "epoch": 1.1264755480607083, "grad_norm": 11.73547579152421, "kl": 0.4765625, "learning_rate": 7.750421585160201e-07, "loss": 0.0005, "reward": 3.694632411003113, "reward_std": 0.11483868956565857, "rewards/final_reward": 1.6840263089165557, "rewards/mask_iou_reward": 0.8420131544582778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6946325302124023, "rewards/thk_ans_format_reward": 1.0, "step": 667, "think_completion_length": 45.53125 }, { "clip_ratio": 0.0, "completion_length": 109.546875, "epoch": 1.1281618887015177, "grad_norm": 6.678149708156542, "kl": 0.568359375, "learning_rate": 7.747048903878583e-07, "loss": 0.0006, "reward": 2.7610191106796265, "reward_std": 0.4594908654689789, "rewards/final_reward": 0.8904040467556769, "rewards/mask_iou_reward": 0.44520202337783843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7610192000865936, "rewards/thk_ans_format_reward": 1.0, "step": 668, "think_completion_length": 39.53125 }, { "clip_ratio": 0.0, "completion_length": 127.96875, "epoch": 1.129848229342327, "grad_norm": 6.113441041476647, "kl": 0.419921875, "learning_rate": 7.743676222596964e-07, "loss": 0.0004, "reward": 3.385975956916809, "reward_std": 0.154005765914917, "rewards/final_reward": 1.206318374353144, "rewards/mask_iou_reward": 0.603159187176572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3859760165214539, "rewards/thk_ans_format_reward": 1.0, "step": 669, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 108.109375, "epoch": 1.1315345699831365, "grad_norm": 4.674232401056913, "kl": 0.4599609375, "learning_rate": 7.740303541315345e-07, "loss": 0.0004, "reward": 3.6678245067596436, "reward_std": 0.02954744128510356, "rewards/final_reward": 1.8396449375646342, "rewards/mask_iou_reward": 0.9198224687823171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.667824625968933, "rewards/thk_ans_format_reward": 1.0, "step": 670, "think_completion_length": 41.15625 }, { "clip_ratio": 0.0, "completion_length": 111.265625, "epoch": 1.1332209106239461, "grad_norm": 9.631130618354605, "kl": 0.462890625, "learning_rate": 7.736930860033727e-07, "loss": 0.0005, "reward": 3.1673879623413086, "reward_std": 0.046774497255682945, "rewards/final_reward": 1.6174178301439586, "rewards/mask_iou_reward": 0.8087089150719793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.167387992143631, "rewards/thk_ans_format_reward": 1.0, "step": 671, "think_completion_length": 41.84375 }, { "clip_ratio": 0.0, "completion_length": 110.96875, "epoch": 1.1349072512647556, "grad_norm": 9.80047585453591, "kl": 0.42578125, "learning_rate": 7.733558178752108e-07, "loss": 0.0004, "reward": 3.186527371406555, "reward_std": 0.2629920244216919, "rewards/final_reward": 1.4596871984128676, "rewards/mask_iou_reward": 0.7298435992064338, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1865273118019104, "rewards/thk_ans_format_reward": 1.0, "step": 672, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 115.25, "epoch": 1.136593591905565, "grad_norm": 8.232232103136324, "kl": 0.421875, "learning_rate": 7.730185497470489e-07, "loss": 0.0004, "reward": 3.802139163017273, "reward_std": 0.16265618288889527, "rewards/final_reward": 1.928601635210926, "rewards/mask_iou_reward": 0.964300817605463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.802139163017273, "rewards/thk_ans_format_reward": 1.0, "step": 673, "think_completion_length": 44.625 }, { "clip_ratio": 0.0, "completion_length": 109.90625, "epoch": 1.1382799325463744, "grad_norm": 5.270149298222591, "kl": 0.45703125, "learning_rate": 7.726812816188871e-07, "loss": 0.0005, "reward": 3.2774369716644287, "reward_std": 0.19858654215931892, "rewards/final_reward": 1.1144515948091382, "rewards/mask_iou_reward": 0.5572257974045691, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2774368524551392, "rewards/thk_ans_format_reward": 1.0, "step": 674, "think_completion_length": 33.59375 }, { "clip_ratio": 0.0, "completion_length": 111.375, "epoch": 1.1399662731871838, "grad_norm": 6.837655439090573, "kl": 1.76171875, "learning_rate": 7.72344013490725e-07, "loss": 0.0018, "reward": 3.363555073738098, "reward_std": 0.28620167076587677, "rewards/final_reward": 1.5152913358286324, "rewards/mask_iou_reward": 0.7576456679143162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3635550737380981, "rewards/thk_ans_format_reward": 1.0, "step": 675, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 125.203125, "epoch": 1.1416526138279932, "grad_norm": 6.200267618177697, "kl": 0.41796875, "learning_rate": 7.720067453625631e-07, "loss": 0.0004, "reward": 3.6870086193084717, "reward_std": 0.17093387246131897, "rewards/final_reward": 1.5394327535416457, "rewards/mask_iou_reward": 0.7697163767708228, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.702633798122406, "rewards/thk_ans_format_reward": 0.984375, "step": 676, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 1.1433389544688026, "grad_norm": 8.51211024924678, "kl": 0.44140625, "learning_rate": 7.716694772344013e-07, "loss": 0.0004, "reward": 2.8949872255325317, "reward_std": 0.41623103618621826, "rewards/final_reward": 0.2714590533213712, "rewards/mask_iou_reward": 0.1357295266606856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8949873745441437, "rewards/thk_ans_format_reward": 1.0, "step": 677, "think_completion_length": 43.5 }, { "clip_ratio": 0.0, "completion_length": 118.21875, "epoch": 1.1450252951096123, "grad_norm": 8.563116450543093, "kl": 0.4755859375, "learning_rate": 7.713322091062394e-07, "loss": 0.0005, "reward": 2.552576780319214, "reward_std": 0.3992340862751007, "rewards/final_reward": 0.848194710225629, "rewards/mask_iou_reward": 0.4240973551128145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5525768399238586, "rewards/thk_ans_format_reward": 1.0, "step": 678, "think_completion_length": 44.0625 }, { "clip_ratio": 0.0, "completion_length": 169.703125, "epoch": 1.1467116357504217, "grad_norm": 6.127334987076423, "kl": 0.4169921875, "learning_rate": 7.709949409780775e-07, "loss": 0.0004, "reward": 3.4885432720184326, "reward_std": 0.19754197634756565, "rewards/final_reward": 1.5432886049163599, "rewards/mask_iou_reward": 0.7716443024581799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4885433316230774, "rewards/thk_ans_format_reward": 1.0, "step": 679, "think_completion_length": 40.375 }, { "clip_ratio": 0.0, "completion_length": 115.390625, "epoch": 1.148397976391231, "grad_norm": 4.875342612314185, "kl": 0.419921875, "learning_rate": 7.706576728499157e-07, "loss": 0.0004, "reward": 2.6762442588806152, "reward_std": 0.27623605728149414, "rewards/final_reward": 1.3332219909973353, "rewards/mask_iou_reward": 0.6666109954986676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6762442886829376, "rewards/thk_ans_format_reward": 1.0, "step": 680, "think_completion_length": 46.21875 }, { "clip_ratio": 0.0, "completion_length": 129.375, "epoch": 1.1500843170320405, "grad_norm": 25.542862127950208, "kl": 0.3955078125, "learning_rate": 7.703204047217538e-07, "loss": 0.0004, "reward": 3.241006016731262, "reward_std": 0.1869177557528019, "rewards/final_reward": 1.3966232754602865, "rewards/mask_iou_reward": 0.6983116377301433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2410059869289398, "rewards/thk_ans_format_reward": 1.0, "step": 681, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 122.4375, "epoch": 1.15177065767285, "grad_norm": 6.6579015132829715, "kl": 0.4658203125, "learning_rate": 7.699831365935919e-07, "loss": 0.0005, "reward": 3.1924837827682495, "reward_std": 0.29617342352867126, "rewards/final_reward": 1.2179031696155878, "rewards/mask_iou_reward": 0.6089515848077939, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.192483812570572, "rewards/thk_ans_format_reward": 1.0, "step": 682, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 114.984375, "epoch": 1.1534569983136593, "grad_norm": 15.655594911719893, "kl": 0.44921875, "learning_rate": 7.6964586846543e-07, "loss": 0.0005, "reward": 3.237351417541504, "reward_std": 0.21372727304697037, "rewards/final_reward": 1.0931966809390936, "rewards/mask_iou_reward": 0.5465983404695468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2373515665531158, "rewards/thk_ans_format_reward": 1.0, "step": 683, "think_completion_length": 42.5625 }, { "clip_ratio": 0.0, "completion_length": 111.90625, "epoch": 1.1551433389544687, "grad_norm": 5.186245702340746, "kl": 0.4375, "learning_rate": 7.69308600337268e-07, "loss": 0.0004, "reward": 3.1966097354888916, "reward_std": 0.27127550914883614, "rewards/final_reward": 1.0101069758644736, "rewards/mask_iou_reward": 0.5050534879322368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1966098546981812, "rewards/thk_ans_format_reward": 1.0, "step": 684, "think_completion_length": 39.25 }, { "clip_ratio": 0.0, "completion_length": 124.390625, "epoch": 1.1568296795952782, "grad_norm": 10.1943751403639, "kl": 0.37109375, "learning_rate": 7.689713322091061e-07, "loss": 0.0003, "reward": 2.9875484704971313, "reward_std": 0.1596406251192093, "rewards/final_reward": 1.1998612642817665, "rewards/mask_iou_reward": 0.5999306321408833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9875486344099045, "rewards/thk_ans_format_reward": 1.0, "step": 685, "think_completion_length": 44.84375 }, { "clip_ratio": 0.0, "completion_length": 118.3125, "epoch": 1.1585160202360876, "grad_norm": 12.939471681884397, "kl": 0.4775390625, "learning_rate": 7.686340640809443e-07, "loss": 0.0005, "reward": 3.0023581981658936, "reward_std": 0.2409796817228198, "rewards/final_reward": 1.081150993998099, "rewards/mask_iou_reward": 0.5405754969990495, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0023581981658936, "rewards/thk_ans_format_reward": 1.0, "step": 686, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 120.640625, "epoch": 1.1602023608768972, "grad_norm": 5.825517597219945, "kl": 0.4091796875, "learning_rate": 7.682967959527824e-07, "loss": 0.0004, "reward": 2.737739324569702, "reward_std": 0.36891575902700424, "rewards/final_reward": 0.8585461742314954, "rewards/mask_iou_reward": 0.4292730871157477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7377393543720245, "rewards/thk_ans_format_reward": 1.0, "step": 687, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 129.390625, "epoch": 1.1618887015177066, "grad_norm": 3.7055147438675196, "kl": 0.392578125, "learning_rate": 7.679595278246206e-07, "loss": 0.0004, "reward": 3.136894106864929, "reward_std": 0.44828712940216064, "rewards/final_reward": 1.4373869457406165, "rewards/mask_iou_reward": 0.7186934728703083, "rewards/sam_format_reward": 0.921875, "rewards/sam_reward_func_ultra": 1.2150189876556396, "rewards/thk_ans_format_reward": 1.0, "step": 688, "think_completion_length": 43.78125 }, { "clip_ratio": 0.0, "completion_length": 147.625, "epoch": 1.163575042158516, "grad_norm": 10.711703911868902, "kl": 0.3662109375, "learning_rate": 7.676222596964587e-07, "loss": 0.0004, "reward": 3.549091100692749, "reward_std": 0.0804364699870348, "rewards/final_reward": 1.502469671340829, "rewards/mask_iou_reward": 0.7512348356704145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5490912199020386, "rewards/thk_ans_format_reward": 1.0, "step": 689, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 119.78125, "epoch": 1.1652613827993255, "grad_norm": 14.365059467586935, "kl": 0.505859375, "learning_rate": 7.672849915682968e-07, "loss": 0.0005, "reward": 2.859722137451172, "reward_std": 0.0706297755241394, "rewards/final_reward": 1.3300087058547911, "rewards/mask_iou_reward": 0.6650043529273956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8597220778465271, "rewards/thk_ans_format_reward": 1.0, "step": 690, "think_completion_length": 48.875 }, { "clip_ratio": 0.0, "completion_length": 137.59375, "epoch": 1.1669477234401349, "grad_norm": 13.83030640010574, "kl": 0.451171875, "learning_rate": 7.66947723440135e-07, "loss": 0.0005, "reward": 3.6626336574554443, "reward_std": 0.08809349499642849, "rewards/final_reward": 1.4519475682557976, "rewards/mask_iou_reward": 0.7259737841278988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6626335382461548, "rewards/thk_ans_format_reward": 1.0, "step": 691, "think_completion_length": 44.3125 }, { "clip_ratio": 0.0, "completion_length": 115.234375, "epoch": 1.1686340640809443, "grad_norm": 5.7118411670163045, "kl": 0.451171875, "learning_rate": 7.666104553119729e-07, "loss": 0.0005, "reward": 3.2752153873443604, "reward_std": 0.22305817902088165, "rewards/final_reward": 1.0218888606161036, "rewards/mask_iou_reward": 0.5109444303080518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2752153277397156, "rewards/thk_ans_format_reward": 1.0, "step": 692, "think_completion_length": 42.25 }, { "clip_ratio": 0.0, "completion_length": 112.953125, "epoch": 1.1703204047217537, "grad_norm": 5.879210219257272, "kl": 0.4287109375, "learning_rate": 7.66273187183811e-07, "loss": 0.0004, "reward": 3.4141026735305786, "reward_std": 0.11216456070542336, "rewards/final_reward": 0.9649098707837429, "rewards/mask_iou_reward": 0.48245493539187145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4141026735305786, "rewards/thk_ans_format_reward": 1.0, "step": 693, "think_completion_length": 44.34375 }, { "clip_ratio": 0.0, "completion_length": 118.65625, "epoch": 1.1720067453625633, "grad_norm": 4.885554768853665, "kl": 0.48828125, "learning_rate": 7.659359190556492e-07, "loss": 0.0005, "reward": 3.471671462059021, "reward_std": 0.023879871238023043, "rewards/final_reward": 1.0082511677110773, "rewards/mask_iou_reward": 0.5041255838555386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.471671223640442, "rewards/thk_ans_format_reward": 1.0, "step": 694, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 117.6875, "epoch": 1.1736930860033727, "grad_norm": 4.265041635129777, "kl": 0.458984375, "learning_rate": 7.655986509274873e-07, "loss": 0.0005, "reward": 2.813219904899597, "reward_std": 0.06556128861848265, "rewards/final_reward": 0.6052844679476623, "rewards/mask_iou_reward": 0.30264223397383117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8132199048995972, "rewards/thk_ans_format_reward": 1.0, "step": 695, "think_completion_length": 46.15625 }, { "clip_ratio": 0.0, "completion_length": 114.140625, "epoch": 1.1753794266441822, "grad_norm": 5.2388044987299045, "kl": 0.3984375, "learning_rate": 7.652613827993254e-07, "loss": 0.0004, "reward": 3.579859733581543, "reward_std": 0.059681566432118416, "rewards/final_reward": 1.395962637751189, "rewards/mask_iou_reward": 0.6979813188755944, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.579859733581543, "rewards/thk_ans_format_reward": 1.0, "step": 696, "think_completion_length": 39.5625 }, { "clip_ratio": 0.0, "completion_length": 149.078125, "epoch": 1.1770657672849916, "grad_norm": 6.088857849607072, "kl": 0.337890625, "learning_rate": 7.649241146711636e-07, "loss": 0.0003, "reward": 3.479218006134033, "reward_std": 0.1960524395108223, "rewards/final_reward": 1.3349460746520103, "rewards/mask_iou_reward": 0.6674730373260052, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4792180061340332, "rewards/thk_ans_format_reward": 1.0, "step": 697, "think_completion_length": 39.0625 }, { "clip_ratio": 0.0, "completion_length": 117.15625, "epoch": 1.178752107925801, "grad_norm": 6.866854285004716, "kl": 0.4990234375, "learning_rate": 7.645868465430017e-07, "loss": 0.0004, "reward": 3.4718170166015625, "reward_std": 0.23394297808408737, "rewards/final_reward": 1.3271848039220981, "rewards/mask_iou_reward": 0.6635924019610491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4874420166015625, "rewards/thk_ans_format_reward": 0.984375, "step": 698, "think_completion_length": 43.96875 }, { "clip_ratio": 0.0, "completion_length": 109.921875, "epoch": 1.1804384485666104, "grad_norm": 5.382111118101294, "kl": 0.453125, "learning_rate": 7.642495784148398e-07, "loss": 0.0005, "reward": 3.5155398845672607, "reward_std": 0.23116005957126617, "rewards/final_reward": 1.6273760561564208, "rewards/mask_iou_reward": 0.8136880280782104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.515539824962616, "rewards/thk_ans_format_reward": 1.0, "step": 699, "think_completion_length": 39.4375 }, { "clip_ratio": 0.0, "completion_length": 118.453125, "epoch": 1.1821247892074198, "grad_norm": 6.276060150190441, "kl": 0.4521484375, "learning_rate": 7.639123102866779e-07, "loss": 0.0005, "reward": 3.605965256690979, "reward_std": 0.26602135598659515, "rewards/final_reward": 1.7934416230025474, "rewards/mask_iou_reward": 0.8967208115012737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.605965256690979, "rewards/thk_ans_format_reward": 1.0, "step": 700, "think_completion_length": 38.53125 }, { "clip_ratio": 0.0, "completion_length": 110.421875, "epoch": 1.1838111298482294, "grad_norm": 14.606307023698015, "kl": 0.41796875, "learning_rate": 7.635750421585159e-07, "loss": 0.0004, "reward": 2.9312102794647217, "reward_std": 0.1353270411491394, "rewards/final_reward": 0.8034653759214077, "rewards/mask_iou_reward": 0.4017326879607038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9312102794647217, "rewards/thk_ans_format_reward": 1.0, "step": 701, "think_completion_length": 40.8125 }, { "clip_ratio": 0.0, "completion_length": 113.078125, "epoch": 1.1854974704890389, "grad_norm": 6.197539138196918, "kl": 0.515625, "learning_rate": 7.63237774030354e-07, "loss": 0.0005, "reward": 3.6228188276290894, "reward_std": 0.27496435306966305, "rewards/final_reward": 1.81913042546311, "rewards/mask_iou_reward": 0.909565212731555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6228187680244446, "rewards/thk_ans_format_reward": 1.0, "step": 702, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 108.6875, "epoch": 1.1871838111298483, "grad_norm": 9.608806612788296, "kl": 0.5234375, "learning_rate": 7.629005059021922e-07, "loss": 0.0005, "reward": 3.431188225746155, "reward_std": 0.208267442882061, "rewards/final_reward": 1.4498300708540364, "rewards/mask_iou_reward": 0.7249150354270182, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4311882853507996, "rewards/thk_ans_format_reward": 1.0, "step": 703, "think_completion_length": 39.84375 }, { "clip_ratio": 0.0, "completion_length": 121.9375, "epoch": 1.1888701517706577, "grad_norm": 5.504246849404247, "kl": 0.537109375, "learning_rate": 7.625632377740303e-07, "loss": 0.0005, "reward": 3.801710367202759, "reward_std": 0.17118039727210999, "rewards/final_reward": 1.7455660960035957, "rewards/mask_iou_reward": 0.8727830480017978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8017104268074036, "rewards/thk_ans_format_reward": 1.0, "step": 704, "think_completion_length": 37.6875 }, { "clip_ratio": 0.0, "completion_length": 135.828125, "epoch": 1.190556492411467, "grad_norm": 11.153695855520748, "kl": 0.525390625, "learning_rate": 7.622259696458684e-07, "loss": 0.0005, "reward": 3.039560914039612, "reward_std": 0.20383157022297382, "rewards/final_reward": 1.255676750998822, "rewards/mask_iou_reward": 0.627838375499411, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0395609438419342, "rewards/thk_ans_format_reward": 1.0, "step": 705, "think_completion_length": 43.71875 }, { "clip_ratio": 0.0, "completion_length": 110.34375, "epoch": 1.1922428330522765, "grad_norm": 11.045631563315313, "kl": 0.4892578125, "learning_rate": 7.618887015177066e-07, "loss": 0.0004, "reward": 3.4573845863342285, "reward_std": 0.01934580714441836, "rewards/final_reward": 1.1347815513852373, "rewards/mask_iou_reward": 0.5673907756926186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4573845863342285, "rewards/thk_ans_format_reward": 1.0, "step": 706, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 115.53125, "epoch": 1.193929173693086, "grad_norm": 6.701662389508587, "kl": 0.546875, "learning_rate": 7.615514333895447e-07, "loss": 0.0005, "reward": 3.5015335083007812, "reward_std": 0.1357786562293768, "rewards/final_reward": 1.0802052054652325, "rewards/mask_iou_reward": 0.5401026027326162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5015335083007812, "rewards/thk_ans_format_reward": 1.0, "step": 707, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 109.21875, "epoch": 1.1956155143338956, "grad_norm": 6.3454562069427025, "kl": 0.529296875, "learning_rate": 7.612141652613827e-07, "loss": 0.0005, "reward": 3.2759647369384766, "reward_std": 0.2564007118344307, "rewards/final_reward": 1.594199117179952, "rewards/mask_iou_reward": 0.797099558589976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2759648561477661, "rewards/thk_ans_format_reward": 1.0, "step": 708, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 106.96875, "epoch": 1.197301854974705, "grad_norm": 7.93254910361403, "kl": 0.474609375, "learning_rate": 7.608768971332209e-07, "loss": 0.0005, "reward": 3.0473880767822266, "reward_std": 0.2008904181420803, "rewards/final_reward": 0.5035804906873941, "rewards/mask_iou_reward": 0.25179024534369704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0473880767822266, "rewards/thk_ans_format_reward": 1.0, "step": 709, "think_completion_length": 36.3125 }, { "clip_ratio": 0.0, "completion_length": 107.15625, "epoch": 1.1989881956155144, "grad_norm": 24.9492335778285, "kl": 0.48046875, "learning_rate": 7.605396290050589e-07, "loss": 0.0005, "reward": 3.3758158683776855, "reward_std": 0.176735520362854, "rewards/final_reward": 1.6817929384532593, "rewards/mask_iou_reward": 0.8408964692266296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3758159279823303, "rewards/thk_ans_format_reward": 1.0, "step": 710, "think_completion_length": 39.59375 }, { "clip_ratio": 0.0, "completion_length": 108.421875, "epoch": 1.2006745362563238, "grad_norm": 6.6202653063539225, "kl": 0.462890625, "learning_rate": 7.602023608768971e-07, "loss": 0.0005, "reward": 3.522263765335083, "reward_std": 0.15052516479045153, "rewards/final_reward": 1.788935256001427, "rewards/mask_iou_reward": 0.8944676280007136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.522263765335083, "rewards/thk_ans_format_reward": 1.0, "step": 711, "think_completion_length": 38.8125 }, { "clip_ratio": 0.0, "completion_length": 110.171875, "epoch": 1.2023608768971332, "grad_norm": 8.148628385247655, "kl": 0.498046875, "learning_rate": 7.598650927487352e-07, "loss": 0.0005, "reward": 3.5046751499176025, "reward_std": 0.18944399803876877, "rewards/final_reward": 1.4865641360598838, "rewards/mask_iou_reward": 0.7432820680299419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5046750903129578, "rewards/thk_ans_format_reward": 1.0, "step": 712, "think_completion_length": 39.90625 }, { "clip_ratio": 0.0, "completion_length": 125.8125, "epoch": 1.2040472175379426, "grad_norm": 18.178155199698164, "kl": 0.4921875, "learning_rate": 7.595278246205733e-07, "loss": 0.0005, "reward": 3.5088882446289062, "reward_std": 0.2718805819749832, "rewards/final_reward": 1.3515101299801353, "rewards/mask_iou_reward": 0.6757550649900677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5088882446289062, "rewards/thk_ans_format_reward": 1.0, "step": 713, "think_completion_length": 38.0 }, { "clip_ratio": 0.0, "completion_length": 110.453125, "epoch": 1.205733558178752, "grad_norm": 4.136701065150466, "kl": 0.5419921875, "learning_rate": 7.591905564924115e-07, "loss": 0.0005, "reward": 3.2674922943115234, "reward_std": 0.12187814339995384, "rewards/final_reward": 0.7277545366620094, "rewards/mask_iou_reward": 0.3638772683310047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2674922943115234, "rewards/thk_ans_format_reward": 1.0, "step": 714, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 110.40625, "epoch": 1.2074198988195615, "grad_norm": 7.655670942990928, "kl": 0.4765625, "learning_rate": 7.588532883642496e-07, "loss": 0.0005, "reward": 3.6942955255508423, "reward_std": 0.2473888397216797, "rewards/final_reward": 1.7120549679054045, "rewards/mask_iou_reward": 0.8560274839527022, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.709920585155487, "rewards/thk_ans_format_reward": 1.0, "step": 715, "think_completion_length": 40.65625 }, { "clip_ratio": 0.0, "completion_length": 110.84375, "epoch": 1.2091062394603709, "grad_norm": 20.231831063772177, "kl": 0.482421875, "learning_rate": 7.585160202360877e-07, "loss": 0.0005, "reward": 2.921970248222351, "reward_std": 0.13596704229712486, "rewards/final_reward": 0.8988025978332418, "rewards/mask_iou_reward": 0.4494012989166209, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9219703376293182, "rewards/thk_ans_format_reward": 1.0, "step": 716, "think_completion_length": 39.78125 }, { "clip_ratio": 0.0, "completion_length": 107.90625, "epoch": 1.2107925801011805, "grad_norm": 8.67676356701758, "kl": 0.46875, "learning_rate": 7.581787521079258e-07, "loss": 0.0005, "reward": 3.2619752883911133, "reward_std": 0.24443187564611435, "rewards/final_reward": 1.20501075584318, "rewards/mask_iou_reward": 0.60250537792159, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2619754672050476, "rewards/thk_ans_format_reward": 1.0, "step": 717, "think_completion_length": 39.75 }, { "clip_ratio": 0.0, "completion_length": 112.078125, "epoch": 1.21247892074199, "grad_norm": 4.620316797154903, "kl": 0.4384765625, "learning_rate": 7.578414839797639e-07, "loss": 0.0004, "reward": 3.1769468784332275, "reward_std": 0.18927378207445145, "rewards/final_reward": 1.619022070116455, "rewards/mask_iou_reward": 0.8095110350582275, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1769469380378723, "rewards/thk_ans_format_reward": 1.0, "step": 718, "think_completion_length": 43.4375 }, { "clip_ratio": 0.0, "completion_length": 116.296875, "epoch": 1.2141652613827993, "grad_norm": 4.793946634699995, "kl": 0.486328125, "learning_rate": 7.575042158516019e-07, "loss": 0.0004, "reward": 3.5579320192337036, "reward_std": 0.04817063407972455, "rewards/final_reward": 1.8388994591552934, "rewards/mask_iou_reward": 0.9194497295776467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5579320192337036, "rewards/thk_ans_format_reward": 1.0, "step": 719, "think_completion_length": 43.34375 }, { "clip_ratio": 0.0, "completion_length": 115.90625, "epoch": 1.2158516020236088, "grad_norm": 6.354021636595106, "kl": 0.552734375, "learning_rate": 7.571669477234401e-07, "loss": 0.0006, "reward": 3.1674487590789795, "reward_std": 0.14819223433732986, "rewards/final_reward": 0.8902573793870587, "rewards/mask_iou_reward": 0.44512868969352937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1674485504627228, "rewards/thk_ans_format_reward": 1.0, "step": 720, "think_completion_length": 42.875 }, { "clip_ratio": 0.0, "completion_length": 118.515625, "epoch": 1.2175379426644182, "grad_norm": 10.438214677628338, "kl": 0.4658203125, "learning_rate": 7.568296795952782e-07, "loss": 0.0005, "reward": 3.6784117221832275, "reward_std": 0.18426834791898727, "rewards/final_reward": 1.4776055255912712, "rewards/mask_iou_reward": 0.7388027627956356, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6784114241600037, "rewards/thk_ans_format_reward": 1.0, "step": 721, "think_completion_length": 48.375 }, { "clip_ratio": 0.0, "completion_length": 119.984375, "epoch": 1.2192242833052276, "grad_norm": 79.1925794146455, "kl": 0.4716796875, "learning_rate": 7.564924114671163e-07, "loss": 0.0005, "reward": 3.3132262229919434, "reward_std": 0.05754976533353329, "rewards/final_reward": 1.12228992876371, "rewards/mask_iou_reward": 0.561144964381855, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3132262229919434, "rewards/thk_ans_format_reward": 1.0, "step": 722, "think_completion_length": 46.96875 }, { "clip_ratio": 0.0, "completion_length": 116.15625, "epoch": 1.220910623946037, "grad_norm": 60.04001298477758, "kl": 0.451171875, "learning_rate": 7.561551433389545e-07, "loss": 0.0005, "reward": 3.064196825027466, "reward_std": 0.3282191604375839, "rewards/final_reward": 0.6696442415236867, "rewards/mask_iou_reward": 0.33482212076184337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.064196765422821, "rewards/thk_ans_format_reward": 1.0, "step": 723, "think_completion_length": 45.78125 }, { "clip_ratio": 0.0, "completion_length": 154.8125, "epoch": 1.2225969645868466, "grad_norm": 8.569614982497429, "kl": 0.4111328125, "learning_rate": 7.558178752107926e-07, "loss": 0.0004, "reward": 3.5133100748062134, "reward_std": 0.053193164989352226, "rewards/final_reward": 1.3529733064799343, "rewards/mask_iou_reward": 0.6764866532399672, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5133100152015686, "rewards/thk_ans_format_reward": 1.0, "step": 724, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 119.953125, "epoch": 1.224283305227656, "grad_norm": 9.357989983029983, "kl": 1.125, "learning_rate": 7.554806070826306e-07, "loss": 0.0011, "reward": 3.160015821456909, "reward_std": 0.32910278625786304, "rewards/final_reward": 1.1335708589589917, "rewards/mask_iou_reward": 0.5667854294794958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.160015881061554, "rewards/thk_ans_format_reward": 1.0, "step": 725, "think_completion_length": 48.96875 }, { "clip_ratio": 0.0, "completion_length": 114.4375, "epoch": 1.2259696458684655, "grad_norm": 5.024754635183879, "kl": 0.478515625, "learning_rate": 7.551433389544688e-07, "loss": 0.0005, "reward": 3.1202811002731323, "reward_std": 0.18038739264011383, "rewards/final_reward": 0.9122605295219147, "rewards/mask_iou_reward": 0.45613026476095736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1202812194824219, "rewards/thk_ans_format_reward": 1.0, "step": 726, "think_completion_length": 45.21875 }, { "clip_ratio": 0.0, "completion_length": 118.375, "epoch": 1.2276559865092749, "grad_norm": 6.988500133578457, "kl": 0.4287109375, "learning_rate": 7.548060708263069e-07, "loss": 0.0004, "reward": 3.2941445112228394, "reward_std": 0.3624084070324898, "rewards/final_reward": 1.4880099850929445, "rewards/mask_iou_reward": 0.7440049925464722, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2941443920135498, "rewards/thk_ans_format_reward": 1.0, "step": 727, "think_completion_length": 44.75 }, { "clip_ratio": 0.0, "completion_length": 115.53125, "epoch": 1.2293423271500843, "grad_norm": 4.250688941174817, "kl": 0.416015625, "learning_rate": 7.544688026981449e-07, "loss": 0.0004, "reward": 3.2780280113220215, "reward_std": 0.0592149943113327, "rewards/final_reward": 1.6572678327724175, "rewards/mask_iou_reward": 0.8286339163862088, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2780280113220215, "rewards/thk_ans_format_reward": 1.0, "step": 728, "think_completion_length": 49.4375 }, { "clip_ratio": 0.0, "completion_length": 131.875, "epoch": 1.2310286677908937, "grad_norm": 10.70609575483574, "kl": 0.4052734375, "learning_rate": 7.541315345699831e-07, "loss": 0.0004, "reward": 3.305624485015869, "reward_std": 0.20255491137504578, "rewards/final_reward": 1.612820795771751, "rewards/mask_iou_reward": 0.8064103978858755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.305624544620514, "rewards/thk_ans_format_reward": 1.0, "step": 729, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 118.609375, "epoch": 1.2327150084317031, "grad_norm": 19.39244591936634, "kl": 0.4228515625, "learning_rate": 7.537942664418212e-07, "loss": 0.0004, "reward": 3.4895849227905273, "reward_std": 0.35745085775852203, "rewards/final_reward": 1.6071061379815976, "rewards/mask_iou_reward": 0.8035530689907988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4895849823951721, "rewards/thk_ans_format_reward": 1.0, "step": 730, "think_completion_length": 51.375 }, { "clip_ratio": 0.0, "completion_length": 123.28125, "epoch": 1.2344013490725128, "grad_norm": 5.224728801556268, "kl": 0.392578125, "learning_rate": 7.534569983136593e-07, "loss": 0.0004, "reward": 3.190197229385376, "reward_std": 0.1471584215760231, "rewards/final_reward": 1.115054770568458, "rewards/mask_iou_reward": 0.557527385284229, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1901973485946655, "rewards/thk_ans_format_reward": 1.0, "step": 731, "think_completion_length": 53.15625 }, { "clip_ratio": 0.0, "completion_length": 119.140625, "epoch": 1.2360876897133222, "grad_norm": 6.368329904579315, "kl": 0.408203125, "learning_rate": 7.531197301854975e-07, "loss": 0.0004, "reward": 2.9243475198745728, "reward_std": 0.15122611075639725, "rewards/final_reward": 0.876438929545167, "rewards/mask_iou_reward": 0.4382194647725835, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9243474006652832, "rewards/thk_ans_format_reward": 1.0, "step": 732, "think_completion_length": 51.53125 }, { "clip_ratio": 0.0, "completion_length": 120.359375, "epoch": 1.2377740303541316, "grad_norm": 7.636127726903107, "kl": 0.513671875, "learning_rate": 7.527824620573355e-07, "loss": 0.0005, "reward": 3.237205386161804, "reward_std": 0.4117434173822403, "rewards/final_reward": 1.32772648991989, "rewards/mask_iou_reward": 0.663863244959945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.237205445766449, "rewards/thk_ans_format_reward": 1.0, "step": 733, "think_completion_length": 50.3125 }, { "clip_ratio": 0.0, "completion_length": 120.5625, "epoch": 1.239460370994941, "grad_norm": 4.156925317466383, "kl": 0.423828125, "learning_rate": 7.524451939291736e-07, "loss": 0.0004, "reward": 3.244380831718445, "reward_std": 0.2315196357667446, "rewards/final_reward": 1.3360523780132965, "rewards/mask_iou_reward": 0.6680261890066482, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2443808317184448, "rewards/thk_ans_format_reward": 1.0, "step": 734, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 125.609375, "epoch": 1.2411467116357504, "grad_norm": 8.29440186819507, "kl": 0.416015625, "learning_rate": 7.521079258010118e-07, "loss": 0.0004, "reward": 3.2429388761520386, "reward_std": 0.285244956612587, "rewards/final_reward": 1.1256669425154386, "rewards/mask_iou_reward": 0.5628334712577193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2429389357566833, "rewards/thk_ans_format_reward": 1.0, "step": 735, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 120.15625, "epoch": 1.2428330522765598, "grad_norm": 6.871379030140198, "kl": 0.439453125, "learning_rate": 7.517706576728498e-07, "loss": 0.0004, "reward": 2.851890802383423, "reward_std": 0.29892025887966156, "rewards/final_reward": 0.9705862343518611, "rewards/mask_iou_reward": 0.48529311717593054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8518907129764557, "rewards/thk_ans_format_reward": 1.0, "step": 736, "think_completion_length": 46.75 }, { "clip_ratio": 0.0, "completion_length": 139.328125, "epoch": 1.2445193929173692, "grad_norm": 5.674713470841899, "kl": 0.4462890625, "learning_rate": 7.51433389544688e-07, "loss": 0.0004, "reward": 3.2446606159210205, "reward_std": 0.3162437919527292, "rewards/final_reward": 1.2440891629731174, "rewards/mask_iou_reward": 0.6220445814865587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.244660496711731, "rewards/thk_ans_format_reward": 1.0, "step": 737, "think_completion_length": 48.5 }, { "clip_ratio": 0.0, "completion_length": 120.296875, "epoch": 1.2462057335581789, "grad_norm": 5.272017288869077, "kl": 0.4306640625, "learning_rate": 7.510961214165261e-07, "loss": 0.0004, "reward": 3.6630584001541138, "reward_std": 0.09967895410954952, "rewards/final_reward": 1.5553507476605395, "rewards/mask_iou_reward": 0.7776753738302697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6630585193634033, "rewards/thk_ans_format_reward": 1.0, "step": 738, "think_completion_length": 48.15625 }, { "clip_ratio": 0.0, "completion_length": 141.484375, "epoch": 1.2478920741989883, "grad_norm": 5.12332748634796, "kl": 0.42578125, "learning_rate": 7.507588532883642e-07, "loss": 0.0004, "reward": 2.5206953287124634, "reward_std": 0.4001058042049408, "rewards/final_reward": 0.29060709585459255, "rewards/mask_iou_reward": 0.14530354792729627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5206954479217529, "rewards/thk_ans_format_reward": 1.0, "step": 739, "think_completion_length": 53.625 }, { "clip_ratio": 0.0, "completion_length": 118.765625, "epoch": 1.2495784148397977, "grad_norm": 9.308612277526414, "kl": 0.5, "learning_rate": 7.504215851602024e-07, "loss": 0.0005, "reward": 3.1267716884613037, "reward_std": 0.14480041339993477, "rewards/final_reward": 1.2190345270701473, "rewards/mask_iou_reward": 0.6095172635350736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1267716884613037, "rewards/thk_ans_format_reward": 1.0, "step": 740, "think_completion_length": 42.5625 }, { "clip_ratio": 0.0, "completion_length": 117.15625, "epoch": 1.2512647554806071, "grad_norm": 5.86388460967559, "kl": 0.453125, "learning_rate": 7.500843170320405e-07, "loss": 0.0005, "reward": 3.2177951335906982, "reward_std": 0.13633359596133232, "rewards/final_reward": 1.0138802716734372, "rewards/mask_iou_reward": 0.5069401358367186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.217795193195343, "rewards/thk_ans_format_reward": 1.0, "step": 741, "think_completion_length": 48.40625 }, { "clip_ratio": 0.0, "completion_length": 118.296875, "epoch": 1.2529510961214165, "grad_norm": 7.697601454438822, "kl": 0.46484375, "learning_rate": 7.497470489038785e-07, "loss": 0.0005, "reward": 3.3932933807373047, "reward_std": 0.16536729037761688, "rewards/final_reward": 1.2362493299776005, "rewards/mask_iou_reward": 0.6181246649888003, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3932933807373047, "rewards/thk_ans_format_reward": 1.0, "step": 742, "think_completion_length": 49.4375 }, { "clip_ratio": 0.0, "completion_length": 133.03125, "epoch": 1.254637436762226, "grad_norm": 7.090758479273383, "kl": 0.3974609375, "learning_rate": 7.494097807757167e-07, "loss": 0.0004, "reward": 3.7598599195480347, "reward_std": 0.18115262687206268, "rewards/final_reward": 1.8922707127451575, "rewards/mask_iou_reward": 0.9461353563725787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7598599791526794, "rewards/thk_ans_format_reward": 1.0, "step": 743, "think_completion_length": 59.1875 }, { "clip_ratio": 0.0, "completion_length": 169.765625, "epoch": 1.2563237774030354, "grad_norm": 11.742361206526029, "kl": 0.3642578125, "learning_rate": 7.490725126475548e-07, "loss": 0.0004, "reward": 3.0096986293792725, "reward_std": 0.3475239537656307, "rewards/final_reward": 0.841675245480936, "rewards/mask_iou_reward": 0.420837622740468, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.02532359957695, "rewards/thk_ans_format_reward": 1.0, "step": 744, "think_completion_length": 53.90625 }, { "clip_ratio": 0.0, "completion_length": 117.453125, "epoch": 1.258010118043845, "grad_norm": 3.906933246720136, "kl": 0.4375, "learning_rate": 7.487352445193928e-07, "loss": 0.0004, "reward": 3.715674877166748, "reward_std": 0.11384453624486923, "rewards/final_reward": 1.829332156329305, "rewards/mask_iou_reward": 0.9146660781646525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7156748175621033, "rewards/thk_ans_format_reward": 1.0, "step": 745, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 140.515625, "epoch": 1.2596964586846542, "grad_norm": 7.713701865661837, "kl": 0.400390625, "learning_rate": 7.48397976391231e-07, "loss": 0.0004, "reward": 3.502246141433716, "reward_std": 0.2445410154759884, "rewards/final_reward": 1.4493324108565608, "rewards/mask_iou_reward": 0.7246662054282804, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5022460222244263, "rewards/thk_ans_format_reward": 1.0, "step": 746, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 122.5625, "epoch": 1.2613827993254638, "grad_norm": 6.49050141193276, "kl": 0.494140625, "learning_rate": 7.480607082630691e-07, "loss": 0.0005, "reward": 2.7671353816986084, "reward_std": 0.26506610214710236, "rewards/final_reward": 0.72433116542667, "rewards/mask_iou_reward": 0.362165582713335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7671354562044144, "rewards/thk_ans_format_reward": 1.0, "step": 747, "think_completion_length": 51.40625 }, { "clip_ratio": 0.0, "completion_length": 118.28125, "epoch": 1.2630691399662732, "grad_norm": 5.370371100753026, "kl": 0.466796875, "learning_rate": 7.477234401349072e-07, "loss": 0.0005, "reward": 3.3842209577560425, "reward_std": 0.12122415006160736, "rewards/final_reward": 1.1605845540527804, "rewards/mask_iou_reward": 0.5802922770263902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3842209577560425, "rewards/thk_ans_format_reward": 1.0, "step": 748, "think_completion_length": 46.1875 }, { "clip_ratio": 0.0, "completion_length": 124.796875, "epoch": 1.2647554806070826, "grad_norm": 7.250959167071798, "kl": 0.416015625, "learning_rate": 7.473861720067454e-07, "loss": 0.0004, "reward": 3.2378557920455933, "reward_std": 0.13333739154040813, "rewards/final_reward": 1.5390109431557328, "rewards/mask_iou_reward": 0.7695054715778664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2378557920455933, "rewards/thk_ans_format_reward": 1.0, "step": 749, "think_completion_length": 48.875 }, { "clip_ratio": 0.0, "completion_length": 123.546875, "epoch": 1.266441821247892, "grad_norm": 13.773598736341574, "kl": 0.3857421875, "learning_rate": 7.470489038785834e-07, "loss": 0.0004, "reward": 2.9623697996139526, "reward_std": 0.670602947473526, "rewards/final_reward": 0.949467456779882, "rewards/mask_iou_reward": 0.474733728389941, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.962369829416275, "rewards/thk_ans_format_reward": 1.0, "step": 750, "think_completion_length": 60.6875 }, { "clip_ratio": 0.0, "completion_length": 122.578125, "epoch": 1.2681281618887015, "grad_norm": 43.82863090143971, "kl": 0.43359375, "learning_rate": 7.467116357504215e-07, "loss": 0.0004, "reward": 2.5886611938476562, "reward_std": 0.16707976162433624, "rewards/final_reward": 0.9058522578390267, "rewards/mask_iou_reward": 0.45292612891951334, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5886611640453339, "rewards/thk_ans_format_reward": 1.0, "step": 751, "think_completion_length": 48.34375 }, { "clip_ratio": 0.0, "completion_length": 121.96875, "epoch": 1.269814502529511, "grad_norm": 8.009493352613628, "kl": 0.4248046875, "learning_rate": 7.463743676222597e-07, "loss": 0.0004, "reward": 3.5888274908065796, "reward_std": 0.14439216628670692, "rewards/final_reward": 1.6300469074796642, "rewards/mask_iou_reward": 0.8150234537398321, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5888275504112244, "rewards/thk_ans_format_reward": 1.0, "step": 752, "think_completion_length": 55.28125 }, { "clip_ratio": 0.0, "completion_length": 120.09375, "epoch": 1.2715008431703203, "grad_norm": 13.0951374787943, "kl": 0.41796875, "learning_rate": 7.460370994940978e-07, "loss": 0.0004, "reward": 3.163659930229187, "reward_std": 0.2516661137342453, "rewards/final_reward": 0.6976526266310328, "rewards/mask_iou_reward": 0.3488263133155164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1636599600315094, "rewards/thk_ans_format_reward": 1.0, "step": 753, "think_completion_length": 49.4375 }, { "clip_ratio": 0.0, "completion_length": 121.796875, "epoch": 1.27318718381113, "grad_norm": 7.955705248218445, "kl": 0.4375, "learning_rate": 7.456998313659358e-07, "loss": 0.0004, "reward": 2.9424456357955933, "reward_std": 0.24202048778533936, "rewards/final_reward": 1.233259705619929, "rewards/mask_iou_reward": 0.6166298528099645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9580706655979156, "rewards/thk_ans_format_reward": 0.984375, "step": 754, "think_completion_length": 46.78125 }, { "clip_ratio": 0.0, "completion_length": 115.296875, "epoch": 1.2748735244519394, "grad_norm": 9.324204820166516, "kl": 0.6923828125, "learning_rate": 7.45362563237774e-07, "loss": 0.0007, "reward": 3.5302281379699707, "reward_std": 0.05718242051079869, "rewards/final_reward": 1.2451604551256392, "rewards/mask_iou_reward": 0.6225802275628196, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5302280187606812, "rewards/thk_ans_format_reward": 1.0, "step": 755, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 121.625, "epoch": 1.2765598650927488, "grad_norm": 5.0297332010064695, "kl": 0.423828125, "learning_rate": 7.450252951096121e-07, "loss": 0.0004, "reward": 3.3245279788970947, "reward_std": 0.2012765109539032, "rewards/final_reward": 1.801916952998812, "rewards/mask_iou_reward": 0.900958476499406, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3245280385017395, "rewards/thk_ans_format_reward": 1.0, "step": 756, "think_completion_length": 52.5 }, { "clip_ratio": 0.0, "completion_length": 146.953125, "epoch": 1.2782462057335582, "grad_norm": 9.471865510151117, "kl": 0.40625, "learning_rate": 7.446880269814502e-07, "loss": 0.0004, "reward": 3.371519088745117, "reward_std": 0.1286439746618271, "rewards/final_reward": 0.9775423947239636, "rewards/mask_iou_reward": 0.4887711973619818, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3715189695358276, "rewards/thk_ans_format_reward": 1.0, "step": 757, "think_completion_length": 52.40625 }, { "clip_ratio": 0.0, "completion_length": 147.703125, "epoch": 1.2799325463743676, "grad_norm": 11.33907667372978, "kl": 0.455078125, "learning_rate": 7.443507588532883e-07, "loss": 0.0005, "reward": 3.3066617250442505, "reward_std": 0.1327102743089199, "rewards/final_reward": 0.9268396180312112, "rewards/mask_iou_reward": 0.4634198090156056, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3066617250442505, "rewards/thk_ans_format_reward": 1.0, "step": 758, "think_completion_length": 51.78125 }, { "clip_ratio": 0.0, "completion_length": 118.953125, "epoch": 1.281618887015177, "grad_norm": 22.710178443792344, "kl": 14.5859375, "learning_rate": 7.440134907251264e-07, "loss": 0.0146, "reward": 3.4230951070785522, "reward_std": 0.08178156521171331, "rewards/final_reward": 1.6240219194514065, "rewards/mask_iou_reward": 0.8120109597257033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4230949878692627, "rewards/thk_ans_format_reward": 1.0, "step": 759, "think_completion_length": 47.875 }, { "clip_ratio": 0.0, "completion_length": 128.953125, "epoch": 1.2833052276559864, "grad_norm": 7.669965203607217, "kl": 0.4013671875, "learning_rate": 7.436762225969646e-07, "loss": 0.0004, "reward": 3.1363537311553955, "reward_std": 0.328082337975502, "rewards/final_reward": 0.8638903050486981, "rewards/mask_iou_reward": 0.43194515252434906, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1519787907600403, "rewards/thk_ans_format_reward": 1.0, "step": 760, "think_completion_length": 54.21875 }, { "clip_ratio": 0.0, "completion_length": 151.453125, "epoch": 1.284991568296796, "grad_norm": 3.83970479145092, "kl": 0.396484375, "learning_rate": 7.433389544688027e-07, "loss": 0.0004, "reward": 2.9037578105926514, "reward_std": 0.2324867658317089, "rewards/final_reward": 0.9024056875965667, "rewards/mask_iou_reward": 0.45120284379828335, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9350077509880066, "rewards/thk_ans_format_reward": 0.984375, "step": 761, "think_completion_length": 50.625 }, { "clip_ratio": 0.0, "completion_length": 137.6875, "epoch": 1.2866779089376055, "grad_norm": 4.220985191384563, "kl": 0.421875, "learning_rate": 7.430016863406408e-07, "loss": 0.0004, "reward": 3.437883973121643, "reward_std": 0.4040543884038925, "rewards/final_reward": 1.551179383404998, "rewards/mask_iou_reward": 0.775589691702499, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4691339135169983, "rewards/thk_ans_format_reward": 0.984375, "step": 762, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 119.59375, "epoch": 1.2883642495784149, "grad_norm": 4.057618415959299, "kl": 0.44140625, "learning_rate": 7.42664418212479e-07, "loss": 0.0004, "reward": 3.699872136116028, "reward_std": 0.17768453806638718, "rewards/final_reward": 1.8048597321254096, "rewards/mask_iou_reward": 0.9024298660627048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.699872076511383, "rewards/thk_ans_format_reward": 1.0, "step": 763, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 124.203125, "epoch": 1.2900505902192243, "grad_norm": 7.307889009036356, "kl": 0.40234375, "learning_rate": 7.42327150084317e-07, "loss": 0.0004, "reward": 2.904768705368042, "reward_std": 0.4192696511745453, "rewards/final_reward": 0.9978510225487277, "rewards/mask_iou_reward": 0.49892551127436385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.904768705368042, "rewards/thk_ans_format_reward": 1.0, "step": 764, "think_completion_length": 57.34375 }, { "clip_ratio": 0.0, "completion_length": 149.390625, "epoch": 1.2917369308600337, "grad_norm": 3.8367155882101893, "kl": 0.3955078125, "learning_rate": 7.419898819561551e-07, "loss": 0.0004, "reward": 2.8194090127944946, "reward_std": 0.10093265399336815, "rewards/final_reward": 1.1600728340864759, "rewards/mask_iou_reward": 0.5800364170432379, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8194091022014618, "rewards/thk_ans_format_reward": 1.0, "step": 765, "think_completion_length": 52.96875 }, { "clip_ratio": 0.0, "completion_length": 116.65625, "epoch": 1.2934232715008431, "grad_norm": 57.108509817729846, "kl": 0.482421875, "learning_rate": 7.416526138279933e-07, "loss": 0.0005, "reward": 3.300451397895813, "reward_std": 0.10350893437862396, "rewards/final_reward": 1.2288283409460157, "rewards/mask_iou_reward": 0.6144141704730078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.300451636314392, "rewards/thk_ans_format_reward": 1.0, "step": 766, "think_completion_length": 45.90625 }, { "clip_ratio": 0.0, "completion_length": 123.0625, "epoch": 1.2951096121416525, "grad_norm": 25.825394262271047, "kl": 0.4189453125, "learning_rate": 7.413153456998313e-07, "loss": 0.0004, "reward": 2.7487651109695435, "reward_std": 0.17696194536983967, "rewards/final_reward": 0.6962242875658127, "rewards/mask_iou_reward": 0.34811214378290634, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7487652003765106, "rewards/thk_ans_format_reward": 1.0, "step": 767, "think_completion_length": 50.9375 }, { "clip_ratio": 0.0, "completion_length": 120.9375, "epoch": 1.2967959527824622, "grad_norm": 7.441897134927553, "kl": 0.7509765625, "learning_rate": 7.409780775716694e-07, "loss": 0.0008, "reward": 3.0058658123016357, "reward_std": 0.20971830189228058, "rewards/final_reward": 0.6484055966150901, "rewards/mask_iou_reward": 0.32420279830754506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0058658123016357, "rewards/thk_ans_format_reward": 1.0, "step": 768, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 135.859375, "epoch": 1.2984822934232714, "grad_norm": 11.188822729031108, "kl": 0.388671875, "learning_rate": 7.406408094435076e-07, "loss": 0.0004, "reward": 2.955198287963867, "reward_std": 0.25020837038755417, "rewards/final_reward": 1.4167753304898385, "rewards/mask_iou_reward": 0.7083876652449193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9551981687545776, "rewards/thk_ans_format_reward": 1.0, "step": 769, "think_completion_length": 48.40625 }, { "clip_ratio": 0.0, "completion_length": 138.578125, "epoch": 1.300168634064081, "grad_norm": 6.824503029956907, "kl": 0.3935546875, "learning_rate": 7.403035413153457e-07, "loss": 0.0004, "reward": 2.642868399620056, "reward_std": 0.1783033236861229, "rewards/final_reward": 0.2176228862671831, "rewards/mask_iou_reward": 0.10881144313359155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6428685784339905, "rewards/thk_ans_format_reward": 1.0, "step": 770, "think_completion_length": 49.46875 }, { "clip_ratio": 0.0, "completion_length": 132.625, "epoch": 1.3018549747048904, "grad_norm": 20.721620825901006, "kl": 0.3740234375, "learning_rate": 7.399662731871838e-07, "loss": 0.0004, "reward": 3.0567421913146973, "reward_std": 0.07508281245827675, "rewards/final_reward": 1.1172808054995134, "rewards/mask_iou_reward": 0.5586404027497567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0567420572042465, "rewards/thk_ans_format_reward": 1.0, "step": 771, "think_completion_length": 41.78125 }, { "clip_ratio": 0.0, "completion_length": 134.0625, "epoch": 1.3035413153456998, "grad_norm": 25.55148365834708, "kl": 0.376953125, "learning_rate": 7.39629005059022e-07, "loss": 0.0004, "reward": 2.609902501106262, "reward_std": 0.3896046429872513, "rewards/final_reward": 1.0064766722582872, "rewards/mask_iou_reward": 0.5032383361291436, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6099025905132294, "rewards/thk_ans_format_reward": 1.0, "step": 772, "think_completion_length": 54.125 }, { "clip_ratio": 0.0, "completion_length": 138.9375, "epoch": 1.3052276559865092, "grad_norm": 4.538825899958136, "kl": 1.71484375, "learning_rate": 7.3929173693086e-07, "loss": 0.0017, "reward": 3.3087058067321777, "reward_std": 0.14748084964230657, "rewards/final_reward": 1.1135952740579675, "rewards/mask_iou_reward": 0.5567976370289838, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.308705747127533, "rewards/thk_ans_format_reward": 1.0, "step": 773, "think_completion_length": 47.1875 }, { "clip_ratio": 0.0, "completion_length": 123.03125, "epoch": 1.3069139966273187, "grad_norm": 5.948882090803801, "kl": 0.419921875, "learning_rate": 7.389544688026981e-07, "loss": 0.0004, "reward": 3.3070883750915527, "reward_std": 0.16637492179870605, "rewards/final_reward": 1.4802129145206084, "rewards/mask_iou_reward": 0.7401064572603042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3070884346961975, "rewards/thk_ans_format_reward": 1.0, "step": 774, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 120.5625, "epoch": 1.3086003372681283, "grad_norm": 11.679887672509015, "kl": 0.4072265625, "learning_rate": 7.386172006745362e-07, "loss": 0.0004, "reward": 3.3442749977111816, "reward_std": 0.19580984860658646, "rewards/final_reward": 1.843980964493185, "rewards/mask_iou_reward": 0.9219904822465925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3442749977111816, "rewards/thk_ans_format_reward": 1.0, "step": 775, "think_completion_length": 49.40625 }, { "clip_ratio": 0.0, "completion_length": 114.6875, "epoch": 1.3102866779089375, "grad_norm": 7.511563914848423, "kl": 0.47265625, "learning_rate": 7.382799325463743e-07, "loss": 0.0005, "reward": 3.577596068382263, "reward_std": 0.11906663700938225, "rewards/final_reward": 1.8184439123146532, "rewards/mask_iou_reward": 0.9092219561573266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.577596127986908, "rewards/thk_ans_format_reward": 1.0, "step": 776, "think_completion_length": 43.90625 }, { "clip_ratio": 0.0, "completion_length": 141.828125, "epoch": 1.3119730185497471, "grad_norm": 6.771204942577976, "kl": 0.390625, "learning_rate": 7.379426644182124e-07, "loss": 0.0004, "reward": 3.6653069257736206, "reward_std": 0.017012731172144413, "rewards/final_reward": 1.5131021459032592, "rewards/mask_iou_reward": 0.7565510729516296, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6653068661689758, "rewards/thk_ans_format_reward": 1.0, "step": 777, "think_completion_length": 43.40625 }, { "clip_ratio": 0.0, "completion_length": 127.921875, "epoch": 1.3136593591905565, "grad_norm": 11.832163253877466, "kl": 0.375, "learning_rate": 7.376053962900506e-07, "loss": 0.0004, "reward": 2.6942551136016846, "reward_std": 0.13270641304552555, "rewards/final_reward": 1.1430431598691082, "rewards/mask_iou_reward": 0.5715215799345541, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6942551732063293, "rewards/thk_ans_format_reward": 1.0, "step": 778, "think_completion_length": 51.65625 }, { "clip_ratio": 0.0, "completion_length": 129.359375, "epoch": 1.315345699831366, "grad_norm": 4.538483574655835, "kl": 0.388671875, "learning_rate": 7.372681281618887e-07, "loss": 0.0004, "reward": 2.9698901176452637, "reward_std": 0.35718169808387756, "rewards/final_reward": 1.0378266910704492, "rewards/mask_iou_reward": 0.5189133455352246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9698899984359741, "rewards/thk_ans_format_reward": 1.0, "step": 779, "think_completion_length": 47.75 }, { "clip_ratio": 0.0, "completion_length": 144.828125, "epoch": 1.3170320404721754, "grad_norm": 5.312250952721113, "kl": 0.4326171875, "learning_rate": 7.369308600337268e-07, "loss": 0.0004, "reward": 2.9204649925231934, "reward_std": 0.2078213393688202, "rewards/final_reward": 0.902995751127586, "rewards/mask_iou_reward": 0.451497875563793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9204651415348053, "rewards/thk_ans_format_reward": 1.0, "step": 780, "think_completion_length": 48.59375 }, { "clip_ratio": 0.0, "completion_length": 127.703125, "epoch": 1.3187183811129848, "grad_norm": 7.112756534834644, "kl": 0.486328125, "learning_rate": 7.365935919055649e-07, "loss": 0.0005, "reward": 3.3957865238189697, "reward_std": 0.29513096809387207, "rewards/final_reward": 1.289510311648049, "rewards/mask_iou_reward": 0.6447551558240245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4114114046096802, "rewards/thk_ans_format_reward": 0.984375, "step": 781, "think_completion_length": 51.125 }, { "clip_ratio": 0.0, "completion_length": 132.046875, "epoch": 1.3204047217537942, "grad_norm": 5.092965912570209, "kl": 0.3681640625, "learning_rate": 7.36256323777403e-07, "loss": 0.0004, "reward": 2.766505479812622, "reward_std": 0.17994603142142296, "rewards/final_reward": 1.114762105762541, "rewards/mask_iou_reward": 0.5573810528812705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7665055394172668, "rewards/thk_ans_format_reward": 1.0, "step": 782, "think_completion_length": 48.90625 }, { "clip_ratio": 0.0, "completion_length": 121.9375, "epoch": 1.3220910623946036, "grad_norm": 15.032888120622165, "kl": 0.4228515625, "learning_rate": 7.35919055649241e-07, "loss": 0.0004, "reward": 2.8511098623275757, "reward_std": 0.2713186927139759, "rewards/final_reward": 0.51069635454804, "rewards/mask_iou_reward": 0.25534817727402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8511098623275757, "rewards/thk_ans_format_reward": 1.0, "step": 783, "think_completion_length": 43.6875 }, { "clip_ratio": 0.0, "completion_length": 155.5, "epoch": 1.3237774030354132, "grad_norm": 10.726585243024278, "kl": 0.388671875, "learning_rate": 7.355817875210792e-07, "loss": 0.0004, "reward": 3.1994192600250244, "reward_std": 0.07405038690194488, "rewards/final_reward": 1.661394365132652, "rewards/mask_iou_reward": 0.830697182566326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1994193196296692, "rewards/thk_ans_format_reward": 1.0, "step": 784, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 121.921875, "epoch": 1.3254637436762227, "grad_norm": 6.017124048186462, "kl": 1.078125, "learning_rate": 7.352445193929173e-07, "loss": 0.0011, "reward": 3.557616353034973, "reward_std": 0.17845550179481506, "rewards/final_reward": 1.5886097221131004, "rewards/mask_iou_reward": 0.7943048610565502, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5576163530349731, "rewards/thk_ans_format_reward": 1.0, "step": 785, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 116.140625, "epoch": 1.327150084317032, "grad_norm": 9.94888675204662, "kl": 0.48046875, "learning_rate": 7.349072512647555e-07, "loss": 0.0005, "reward": 3.732688069343567, "reward_std": 0.053104594349861145, "rewards/final_reward": 1.82699322983769, "rewards/mask_iou_reward": 0.913496614918845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7326880097389221, "rewards/thk_ans_format_reward": 1.0, "step": 786, "think_completion_length": 48.75 }, { "clip_ratio": 0.0, "completion_length": 117.96875, "epoch": 1.3288364249578415, "grad_norm": 5.103127094843979, "kl": 0.4130859375, "learning_rate": 7.345699831365936e-07, "loss": 0.0004, "reward": 3.539349317550659, "reward_std": 0.08835931494832039, "rewards/final_reward": 1.3966989172543376, "rewards/mask_iou_reward": 0.6983494586271688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5393492579460144, "rewards/thk_ans_format_reward": 1.0, "step": 787, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 117.53125, "epoch": 1.330522765598651, "grad_norm": 4.8496798516590705, "kl": 0.4423828125, "learning_rate": 7.342327150084317e-07, "loss": 0.0004, "reward": 2.9521323442459106, "reward_std": 0.26333199441432953, "rewards/final_reward": 0.4716661765206818, "rewards/mask_iou_reward": 0.2358330882603409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9521324634552002, "rewards/thk_ans_format_reward": 1.0, "step": 788, "think_completion_length": 50.0 }, { "clip_ratio": 0.0, "completion_length": 116.28125, "epoch": 1.3322091062394603, "grad_norm": 19.90609841032805, "kl": 0.4208984375, "learning_rate": 7.338954468802699e-07, "loss": 0.0004, "reward": 2.9552639722824097, "reward_std": 0.030708997743204236, "rewards/final_reward": 0.9783795007841563, "rewards/mask_iou_reward": 0.4891897503920781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9552639275789261, "rewards/thk_ans_format_reward": 1.0, "step": 789, "think_completion_length": 45.53125 }, { "clip_ratio": 0.0, "completion_length": 132.640625, "epoch": 1.3338954468802697, "grad_norm": 7.037305808313017, "kl": 0.431640625, "learning_rate": 7.335581787521079e-07, "loss": 0.0004, "reward": 3.3982867002487183, "reward_std": 0.17157932370901108, "rewards/final_reward": 1.3373675370522078, "rewards/mask_iou_reward": 0.6686837685261039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3982867002487183, "rewards/thk_ans_format_reward": 1.0, "step": 790, "think_completion_length": 49.09375 }, { "clip_ratio": 0.0, "completion_length": 115.515625, "epoch": 1.3355817875210794, "grad_norm": 14.637715513326066, "kl": 0.4501953125, "learning_rate": 7.332209106239459e-07, "loss": 0.0005, "reward": 3.3809139728546143, "reward_std": 0.3167175129055977, "rewards/final_reward": 1.8598589636448288, "rewards/mask_iou_reward": 0.9299294818224144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3809138536453247, "rewards/thk_ans_format_reward": 1.0, "step": 791, "think_completion_length": 48.1875 }, { "clip_ratio": 0.0, "completion_length": 115.328125, "epoch": 1.3372681281618888, "grad_norm": 63.794113836109965, "kl": 0.4267578125, "learning_rate": 7.328836424957841e-07, "loss": 0.0004, "reward": 3.1712608337402344, "reward_std": 0.029415050521492958, "rewards/final_reward": 0.8930833080697248, "rewards/mask_iou_reward": 0.4465416540348624, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1712608337402344, "rewards/thk_ans_format_reward": 1.0, "step": 792, "think_completion_length": 49.40625 }, { "clip_ratio": 0.0, "completion_length": 120.15625, "epoch": 1.3389544688026982, "grad_norm": 7.031832709390953, "kl": 0.705078125, "learning_rate": 7.325463743676222e-07, "loss": 0.0007, "reward": 3.364739775657654, "reward_std": 0.13826466910541058, "rewards/final_reward": 1.8511947276274574, "rewards/mask_iou_reward": 0.9255973638137287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3647398352622986, "rewards/thk_ans_format_reward": 1.0, "step": 793, "think_completion_length": 54.25 }, { "clip_ratio": 0.0, "completion_length": 119.390625, "epoch": 1.3406408094435076, "grad_norm": 4.754955191343839, "kl": 0.4521484375, "learning_rate": 7.322091062394603e-07, "loss": 0.0005, "reward": 3.746224522590637, "reward_std": 0.06749487156048417, "rewards/final_reward": 1.532440120591791, "rewards/mask_iou_reward": 0.7662200602958955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7462245225906372, "rewards/thk_ans_format_reward": 1.0, "step": 794, "think_completion_length": 49.53125 }, { "clip_ratio": 0.0, "completion_length": 116.421875, "epoch": 1.342327150084317, "grad_norm": 6.023167821978929, "kl": 0.4140625, "learning_rate": 7.318718381112985e-07, "loss": 0.0004, "reward": 3.211634874343872, "reward_std": 0.1754566803574562, "rewards/final_reward": 1.3337678355455997, "rewards/mask_iou_reward": 0.6668839177727999, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2116347551345825, "rewards/thk_ans_format_reward": 1.0, "step": 795, "think_completion_length": 46.8125 }, { "clip_ratio": 0.0, "completion_length": 141.421875, "epoch": 1.3440134907251264, "grad_norm": 8.1196912130005, "kl": 0.41015625, "learning_rate": 7.315345699831366e-07, "loss": 0.0004, "reward": 3.1976126432418823, "reward_std": 0.3106941059231758, "rewards/final_reward": 0.861060513926105, "rewards/mask_iou_reward": 0.4305302569630525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.197612702846527, "rewards/thk_ans_format_reward": 1.0, "step": 796, "think_completion_length": 51.09375 }, { "clip_ratio": 0.0, "completion_length": 122.8125, "epoch": 1.3456998313659359, "grad_norm": 21.674449037201075, "kl": 0.455078125, "learning_rate": 7.311973018549747e-07, "loss": 0.0005, "reward": 3.1202865839004517, "reward_std": 0.21427929773926735, "rewards/final_reward": 1.0092933340170596, "rewards/mask_iou_reward": 0.5046466670085298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1202866435050964, "rewards/thk_ans_format_reward": 1.0, "step": 797, "think_completion_length": 50.09375 }, { "clip_ratio": 0.0, "completion_length": 125.4375, "epoch": 1.3473861720067455, "grad_norm": 9.802205092093088, "kl": 0.421875, "learning_rate": 7.308600337268129e-07, "loss": 0.0004, "reward": 2.962410807609558, "reward_std": 0.2575262784957886, "rewards/final_reward": 0.8481624548206849, "rewards/mask_iou_reward": 0.42408122741034243, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9624108374118805, "rewards/thk_ans_format_reward": 1.0, "step": 798, "think_completion_length": 59.21875 }, { "clip_ratio": 0.0, "completion_length": 124.5, "epoch": 1.3490725126475547, "grad_norm": 8.383808482465419, "kl": 0.4482421875, "learning_rate": 7.305227655986509e-07, "loss": 0.0004, "reward": 3.0775705575942993, "reward_std": 0.25188253819942474, "rewards/final_reward": 1.043739117647533, "rewards/mask_iou_reward": 0.5218695588237665, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0931956470012665, "rewards/thk_ans_format_reward": 1.0, "step": 799, "think_completion_length": 51.34375 }, { "clip_ratio": 0.0, "completion_length": 136.9375, "epoch": 1.3507588532883643, "grad_norm": 9.518272268526802, "kl": 0.3837890625, "learning_rate": 7.301854974704889e-07, "loss": 0.0004, "reward": 3.0064759254455566, "reward_std": 0.4474050849676132, "rewards/final_reward": 0.5596855725569843, "rewards/mask_iou_reward": 0.27984278627849213, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.006475806236267, "rewards/thk_ans_format_reward": 1.0, "step": 800, "think_completion_length": 54.0625 }, { "clip_ratio": 0.0, "completion_length": 119.65625, "epoch": 1.3524451939291737, "grad_norm": 6.1880658285669785, "kl": 0.4833984375, "learning_rate": 7.298482293423271e-07, "loss": 0.0005, "reward": 3.1219156980514526, "reward_std": 0.30353303998708725, "rewards/final_reward": 1.1040391526657047, "rewards/mask_iou_reward": 0.5520195763328524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.121915653347969, "rewards/thk_ans_format_reward": 1.0, "step": 801, "think_completion_length": 53.9375 }, { "clip_ratio": 0.0, "completion_length": 127.546875, "epoch": 1.3541315345699831, "grad_norm": 9.625310119019451, "kl": 0.4765625, "learning_rate": 7.295109612141652e-07, "loss": 0.0005, "reward": 3.4418479204177856, "reward_std": 0.22188640385866165, "rewards/final_reward": 1.8515527330361707, "rewards/mask_iou_reward": 0.9257763665180854, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4418478608131409, "rewards/thk_ans_format_reward": 1.0, "step": 802, "think_completion_length": 47.96875 }, { "clip_ratio": 0.0, "completion_length": 122.28125, "epoch": 1.3558178752107926, "grad_norm": 8.513931375294348, "kl": 0.4482421875, "learning_rate": 7.291736930860033e-07, "loss": 0.0004, "reward": 3.2668780088424683, "reward_std": 0.17149719037115574, "rewards/final_reward": 1.26499462085288, "rewards/mask_iou_reward": 0.63249731042644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.266878068447113, "rewards/thk_ans_format_reward": 1.0, "step": 803, "think_completion_length": 53.375 }, { "clip_ratio": 0.0, "completion_length": 141.09375, "epoch": 1.357504215851602, "grad_norm": 10.137968549400595, "kl": 0.4609375, "learning_rate": 7.288364249578415e-07, "loss": 0.0005, "reward": 3.2986963987350464, "reward_std": 0.28296051174402237, "rewards/final_reward": 1.6626159487723857, "rewards/mask_iou_reward": 0.8313079743861929, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3143212795257568, "rewards/thk_ans_format_reward": 1.0, "step": 804, "think_completion_length": 50.25 }, { "clip_ratio": 0.0, "completion_length": 116.546875, "epoch": 1.3591905564924116, "grad_norm": 6.621797528995769, "kl": 0.4189453125, "learning_rate": 7.284991568296796e-07, "loss": 0.0004, "reward": 3.6139891147613525, "reward_std": 0.3693799674510956, "rewards/final_reward": 1.80038448323186, "rewards/mask_iou_reward": 0.90019224161593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6139891147613525, "rewards/thk_ans_format_reward": 1.0, "step": 805, "think_completion_length": 56.25 }, { "clip_ratio": 0.0, "completion_length": 142.671875, "epoch": 1.3608768971332208, "grad_norm": 42.489537876572705, "kl": 0.439453125, "learning_rate": 7.281618887015177e-07, "loss": 0.0005, "reward": 3.02545964717865, "reward_std": 0.10977564379572868, "rewards/final_reward": 1.0804780233200864, "rewards/mask_iou_reward": 0.5402390116600432, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.04108464717865, "rewards/thk_ans_format_reward": 1.0, "step": 806, "think_completion_length": 56.0625 }, { "clip_ratio": 0.0, "completion_length": 126.75, "epoch": 1.3625632377740304, "grad_norm": 13.881123412247668, "kl": 0.4365234375, "learning_rate": 7.278246205733559e-07, "loss": 0.0004, "reward": 3.5061166286468506, "reward_std": 0.2770638167858124, "rewards/final_reward": 1.5442817771756299, "rewards/mask_iou_reward": 0.7721408885878149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5061166882514954, "rewards/thk_ans_format_reward": 1.0, "step": 807, "think_completion_length": 63.34375 }, { "clip_ratio": 0.0, "completion_length": 123.53125, "epoch": 1.3642495784148398, "grad_norm": 44.01742029034989, "kl": 0.455078125, "learning_rate": 7.274873524451938e-07, "loss": 0.0005, "reward": 3.0958797931671143, "reward_std": 0.06491614319384098, "rewards/final_reward": 1.5420486778303586, "rewards/mask_iou_reward": 0.7710243389151793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.095879852771759, "rewards/thk_ans_format_reward": 1.0, "step": 808, "think_completion_length": 54.0625 }, { "clip_ratio": 0.0, "completion_length": 130.375, "epoch": 1.3659359190556493, "grad_norm": 7.506006853834921, "kl": 0.4501953125, "learning_rate": 7.271500843170319e-07, "loss": 0.0005, "reward": 3.369332194328308, "reward_std": 0.3005400598049164, "rewards/final_reward": 1.4333333109147426, "rewards/mask_iou_reward": 0.7166666554573713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3693323731422424, "rewards/thk_ans_format_reward": 1.0, "step": 809, "think_completion_length": 50.96875 }, { "clip_ratio": 0.0, "completion_length": 125.09375, "epoch": 1.3676222596964587, "grad_norm": 5.096555219506506, "kl": 0.4072265625, "learning_rate": 7.268128161888701e-07, "loss": 0.0004, "reward": 3.2549372911453247, "reward_std": 0.11136971414089203, "rewards/final_reward": 1.3701246547199526, "rewards/mask_iou_reward": 0.6850623273599763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.25493723154068, "rewards/thk_ans_format_reward": 1.0, "step": 810, "think_completion_length": 52.125 }, { "clip_ratio": 0.0, "completion_length": 122.34375, "epoch": 1.369308600337268, "grad_norm": 4.165347153065698, "kl": 0.4443359375, "learning_rate": 7.264755480607082e-07, "loss": 0.0004, "reward": 3.306629776954651, "reward_std": 0.20777817629277706, "rewards/final_reward": 1.4866853816181873, "rewards/mask_iou_reward": 0.7433426908090937, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.306629717350006, "rewards/thk_ans_format_reward": 1.0, "step": 811, "think_completion_length": 51.65625 }, { "clip_ratio": 0.0, "completion_length": 142.4375, "epoch": 1.3709949409780775, "grad_norm": 16.28929272778552, "kl": 0.427734375, "learning_rate": 7.261382799325464e-07, "loss": 0.0004, "reward": 3.4713690280914307, "reward_std": 0.3839820772409439, "rewards/final_reward": 1.678402783229804, "rewards/mask_iou_reward": 0.839201391614902, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.5026191473007202, "rewards/thk_ans_format_reward": 1.0, "step": 812, "think_completion_length": 54.84375 }, { "clip_ratio": 0.0, "completion_length": 110.15625, "epoch": 1.372681281618887, "grad_norm": 7.969905817192309, "kl": 0.392578125, "learning_rate": 7.258010118043845e-07, "loss": 0.0004, "reward": 3.7847758531570435, "reward_std": 0.1786189409904182, "rewards/final_reward": 1.7245267628009748, "rewards/mask_iou_reward": 0.8622633814004874, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.784775972366333, "rewards/thk_ans_format_reward": 1.0, "step": 813, "think_completion_length": 50.0 }, { "clip_ratio": 0.0, "completion_length": 142.515625, "epoch": 1.3743676222596966, "grad_norm": 4.368367320540565, "kl": 0.4111328125, "learning_rate": 7.254637436762226e-07, "loss": 0.0004, "reward": 3.223156690597534, "reward_std": 0.10445614764466882, "rewards/final_reward": 0.6421020966983467, "rewards/mask_iou_reward": 0.32105104834917336, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2387816905975342, "rewards/thk_ans_format_reward": 1.0, "step": 814, "think_completion_length": 58.0625 }, { "clip_ratio": 0.0, "completion_length": 145.359375, "epoch": 1.376053962900506, "grad_norm": 5.653449443697027, "kl": 0.455078125, "learning_rate": 7.251264755480608e-07, "loss": 0.0005, "reward": 3.348850727081299, "reward_std": 0.04212654661387205, "rewards/final_reward": 1.0661254905194237, "rewards/mask_iou_reward": 0.5330627452597119, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3488507866859436, "rewards/thk_ans_format_reward": 1.0, "step": 815, "think_completion_length": 52.28125 }, { "clip_ratio": 0.0, "completion_length": 126.921875, "epoch": 1.3777403035413154, "grad_norm": 6.96340082616904, "kl": 0.4375, "learning_rate": 7.247892074198987e-07, "loss": 0.0004, "reward": 3.5712579488754272, "reward_std": 0.2067468911409378, "rewards/final_reward": 1.284450105073941, "rewards/mask_iou_reward": 0.6422250525369705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5712580680847168, "rewards/thk_ans_format_reward": 1.0, "step": 816, "think_completion_length": 58.46875 }, { "clip_ratio": 0.0, "completion_length": 131.359375, "epoch": 1.3794266441821248, "grad_norm": 5.219801172976387, "kl": 0.396484375, "learning_rate": 7.244519392917368e-07, "loss": 0.0004, "reward": 3.140303611755371, "reward_std": 0.13667535781860352, "rewards/final_reward": 0.8110110717477675, "rewards/mask_iou_reward": 0.40550553587388377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.140303611755371, "rewards/thk_ans_format_reward": 1.0, "step": 817, "think_completion_length": 60.03125 }, { "clip_ratio": 0.0, "completion_length": 142.609375, "epoch": 1.3811129848229342, "grad_norm": 4.155579350217279, "kl": 0.3583984375, "learning_rate": 7.24114671163575e-07, "loss": 0.0004, "reward": 2.7823375463485718, "reward_std": 0.2525832876563072, "rewards/final_reward": 0.6301977560473451, "rewards/mask_iou_reward": 0.31509887802367254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.782337486743927, "rewards/thk_ans_format_reward": 1.0, "step": 818, "think_completion_length": 53.34375 }, { "clip_ratio": 0.0, "completion_length": 120.625, "epoch": 1.3827993254637436, "grad_norm": 16.17353535009975, "kl": 0.4296875, "learning_rate": 7.237774030354131e-07, "loss": 0.0004, "reward": 3.2859296798706055, "reward_std": 0.0780464205890894, "rewards/final_reward": 1.156685700621454, "rewards/mask_iou_reward": 0.578342850310727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.285929560661316, "rewards/thk_ans_format_reward": 1.0, "step": 819, "think_completion_length": 53.40625 }, { "clip_ratio": 0.0, "completion_length": 133.90625, "epoch": 1.384485666104553, "grad_norm": 6.598230146187883, "kl": 0.408203125, "learning_rate": 7.234401349072512e-07, "loss": 0.0004, "reward": 3.308698892593384, "reward_std": 0.14432849548757076, "rewards/final_reward": 0.821684403678177, "rewards/mask_iou_reward": 0.4108422018390885, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3086988925933838, "rewards/thk_ans_format_reward": 1.0, "step": 820, "think_completion_length": 55.125 }, { "clip_ratio": 0.0, "completion_length": 128.84375, "epoch": 1.3861720067453627, "grad_norm": 7.929231773635772, "kl": 0.419921875, "learning_rate": 7.231028667790894e-07, "loss": 0.0004, "reward": 3.2562429904937744, "reward_std": 0.18748314306139946, "rewards/final_reward": 1.221749844106246, "rewards/mask_iou_reward": 0.610874922053123, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2562429904937744, "rewards/thk_ans_format_reward": 1.0, "step": 821, "think_completion_length": 53.5 }, { "clip_ratio": 0.0, "completion_length": 124.8125, "epoch": 1.387858347386172, "grad_norm": 16.674172381983915, "kl": 0.455078125, "learning_rate": 7.227655986509275e-07, "loss": 0.0005, "reward": 3.353366732597351, "reward_std": 0.20042579993605614, "rewards/final_reward": 0.8192778731739727, "rewards/mask_iou_reward": 0.40963893658698636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3533667922019958, "rewards/thk_ans_format_reward": 1.0, "step": 822, "think_completion_length": 60.96875 }, { "clip_ratio": 0.0, "completion_length": 124.0, "epoch": 1.3895446880269815, "grad_norm": 13.233620326708719, "kl": 0.408203125, "learning_rate": 7.224283305227656e-07, "loss": 0.0004, "reward": 3.7568464279174805, "reward_std": 0.035045892000198364, "rewards/final_reward": 1.9271590606246267, "rewards/mask_iou_reward": 0.9635795303123134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7568464279174805, "rewards/thk_ans_format_reward": 1.0, "step": 823, "think_completion_length": 58.28125 }, { "clip_ratio": 0.0, "completion_length": 172.140625, "epoch": 1.391231028667791, "grad_norm": 5.457368929175906, "kl": 0.365234375, "learning_rate": 7.220910623946038e-07, "loss": 0.0004, "reward": 3.212060332298279, "reward_std": 0.2671176493167877, "rewards/final_reward": 1.6695634439718066, "rewards/mask_iou_reward": 0.8347817219859033, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2276853322982788, "rewards/thk_ans_format_reward": 1.0, "step": 824, "think_completion_length": 54.875 }, { "clip_ratio": 0.0, "completion_length": 126.6875, "epoch": 1.3929173693086003, "grad_norm": 9.735584218214859, "kl": 0.474609375, "learning_rate": 7.217537942664417e-07, "loss": 0.0005, "reward": 3.3629668951034546, "reward_std": 0.20313755422830582, "rewards/final_reward": 1.441508973487453, "rewards/mask_iou_reward": 0.7207544867437266, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3629668951034546, "rewards/thk_ans_format_reward": 1.0, "step": 825, "think_completion_length": 55.875 }, { "clip_ratio": 0.0, "completion_length": 118.21875, "epoch": 1.3946037099494097, "grad_norm": 8.522991047543638, "kl": 0.4296875, "learning_rate": 7.214165261382798e-07, "loss": 0.0004, "reward": 3.4234044551849365, "reward_std": 0.40352555364370346, "rewards/final_reward": 1.0577974036356383, "rewards/mask_iou_reward": 0.5288987018178192, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4234043955802917, "rewards/thk_ans_format_reward": 1.0, "step": 826, "think_completion_length": 55.65625 }, { "clip_ratio": 0.0, "completion_length": 124.25, "epoch": 1.3962900505902192, "grad_norm": 15.899916744193234, "kl": 0.4189453125, "learning_rate": 7.21079258010118e-07, "loss": 0.0004, "reward": 3.3997583389282227, "reward_std": 0.10933668166399002, "rewards/final_reward": 1.0764809433615938, "rewards/mask_iou_reward": 0.5382404716807969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3997583985328674, "rewards/thk_ans_format_reward": 1.0, "step": 827, "think_completion_length": 49.71875 }, { "clip_ratio": 0.0, "completion_length": 131.734375, "epoch": 1.3979763912310288, "grad_norm": 5.387528232664865, "kl": 0.4287109375, "learning_rate": 7.207419898819561e-07, "loss": 0.0004, "reward": 3.3618130683898926, "reward_std": 0.2051006779074669, "rewards/final_reward": 1.1968374017252494, "rewards/mask_iou_reward": 0.5984187008626247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3618130087852478, "rewards/thk_ans_format_reward": 1.0, "step": 828, "think_completion_length": 54.53125 }, { "clip_ratio": 0.0, "completion_length": 129.625, "epoch": 1.399662731871838, "grad_norm": 12.898067805148317, "kl": 0.4130859375, "learning_rate": 7.204047217537942e-07, "loss": 0.0004, "reward": 3.607871890068054, "reward_std": 0.12004952877759933, "rewards/final_reward": 1.4956458828207402, "rewards/mask_iou_reward": 0.7478229414103701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.607871949672699, "rewards/thk_ans_format_reward": 1.0, "step": 829, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 131.0, "epoch": 1.4013490725126476, "grad_norm": 9.22840356229312, "kl": 0.42578125, "learning_rate": 7.200674536256324e-07, "loss": 0.0004, "reward": 2.9152538776397705, "reward_std": 0.16578126698732376, "rewards/final_reward": 1.0610994051270966, "rewards/mask_iou_reward": 0.5305497025635483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9152538776397705, "rewards/thk_ans_format_reward": 1.0, "step": 830, "think_completion_length": 54.90625 }, { "clip_ratio": 0.0, "completion_length": 158.734375, "epoch": 1.403035413153457, "grad_norm": 15.11425361744145, "kl": 0.3671875, "learning_rate": 7.197301854974705e-07, "loss": 0.0004, "reward": 3.1335614919662476, "reward_std": 0.2536454573273659, "rewards/final_reward": 1.614278669479924, "rewards/mask_iou_reward": 0.807139334739962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1335614323616028, "rewards/thk_ans_format_reward": 1.0, "step": 831, "think_completion_length": 59.15625 }, { "clip_ratio": 0.0, "completion_length": 126.359375, "epoch": 1.4047217537942664, "grad_norm": 13.660412359550188, "kl": 0.4609375, "learning_rate": 7.193929173693086e-07, "loss": 0.0005, "reward": 3.0477449893951416, "reward_std": 0.28462807834148407, "rewards/final_reward": 1.306179247026351, "rewards/mask_iou_reward": 0.6530896235131755, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0633699893951416, "rewards/thk_ans_format_reward": 1.0, "step": 832, "think_completion_length": 58.34375 }, { "clip_ratio": 0.0, "completion_length": 121.609375, "epoch": 1.4064080944350759, "grad_norm": 4.762614594252812, "kl": 0.4189453125, "learning_rate": 7.190556492411467e-07, "loss": 0.0004, "reward": 3.0881171226501465, "reward_std": 0.2660984881222248, "rewards/final_reward": 0.7231560324770698, "rewards/mask_iou_reward": 0.3615780162385349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0881169438362122, "rewards/thk_ans_format_reward": 1.0, "step": 833, "think_completion_length": 60.3125 }, { "clip_ratio": 0.0, "completion_length": 124.96875, "epoch": 1.4080944350758853, "grad_norm": 157.51508190258878, "kl": 0.490234375, "learning_rate": 7.187183811129847e-07, "loss": 0.0005, "reward": 3.0769537687301636, "reward_std": 0.15264162048697472, "rewards/final_reward": 1.3258674626582714, "rewards/mask_iou_reward": 0.6629337313291357, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0769538879394531, "rewards/thk_ans_format_reward": 1.0, "step": 834, "think_completion_length": 50.53125 }, { "clip_ratio": 0.0, "completion_length": 131.484375, "epoch": 1.4097807757166947, "grad_norm": 20.709158949497038, "kl": 0.4609375, "learning_rate": 7.183811129848229e-07, "loss": 0.0005, "reward": 3.406615734100342, "reward_std": 0.07765450701117516, "rewards/final_reward": 1.1670628162854402, "rewards/mask_iou_reward": 0.5835314081427201, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.406615674495697, "rewards/thk_ans_format_reward": 1.0, "step": 835, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 139.359375, "epoch": 1.411467116357504, "grad_norm": 14.63598679525673, "kl": 0.4140625, "learning_rate": 7.18043844856661e-07, "loss": 0.0004, "reward": 3.6451518535614014, "reward_std": 0.3170605003833771, "rewards/final_reward": 1.7678322310469072, "rewards/mask_iou_reward": 0.8839161155234536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6451519131660461, "rewards/thk_ans_format_reward": 1.0, "step": 836, "think_completion_length": 58.03125 }, { "clip_ratio": 0.0, "completion_length": 135.8125, "epoch": 1.4131534569983137, "grad_norm": 18.364809498373194, "kl": 0.4267578125, "learning_rate": 7.177065767284991e-07, "loss": 0.0004, "reward": 3.175464391708374, "reward_std": 0.21890763938426971, "rewards/final_reward": 1.4261317621170528, "rewards/mask_iou_reward": 0.7130658810585264, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.175464391708374, "rewards/thk_ans_format_reward": 1.0, "step": 837, "think_completion_length": 59.75 }, { "clip_ratio": 0.0, "completion_length": 129.796875, "epoch": 1.4148397976391232, "grad_norm": 4.826306279345156, "kl": 0.4453125, "learning_rate": 7.173693086003373e-07, "loss": 0.0004, "reward": 3.2676570415496826, "reward_std": 0.07519757375121117, "rewards/final_reward": 0.8388647625645046, "rewards/mask_iou_reward": 0.4194323812822523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2676568925380707, "rewards/thk_ans_format_reward": 1.0, "step": 838, "think_completion_length": 55.96875 }, { "clip_ratio": 0.0, "completion_length": 150.96875, "epoch": 1.4165261382799326, "grad_norm": 4.5560595609149495, "kl": 0.400390625, "learning_rate": 7.170320404721754e-07, "loss": 0.0004, "reward": 3.2105181217193604, "reward_std": 0.10261780396103859, "rewards/final_reward": 0.9419267024011992, "rewards/mask_iou_reward": 0.4709633512005996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2105180025100708, "rewards/thk_ans_format_reward": 1.0, "step": 839, "think_completion_length": 55.25 }, { "clip_ratio": 0.0, "completion_length": 143.796875, "epoch": 1.418212478920742, "grad_norm": 5.034386179760682, "kl": 0.439453125, "learning_rate": 7.166947723440135e-07, "loss": 0.0004, "reward": 2.976960778236389, "reward_std": 0.08269162010401487, "rewards/final_reward": 0.4845235261759516, "rewards/mask_iou_reward": 0.2422617630879758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9769608080387115, "rewards/thk_ans_format_reward": 1.0, "step": 840, "think_completion_length": 54.84375 }, { "clip_ratio": 0.0, "completion_length": 120.09375, "epoch": 1.4198988195615514, "grad_norm": 16.873982914654313, "kl": 0.513671875, "learning_rate": 7.163575042158516e-07, "loss": 0.0005, "reward": 3.330239415168762, "reward_std": 0.09310411475598812, "rewards/final_reward": 0.9805123238516863, "rewards/mask_iou_reward": 0.49025616192584315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3302394151687622, "rewards/thk_ans_format_reward": 1.0, "step": 841, "think_completion_length": 52.09375 }, { "clip_ratio": 0.0, "completion_length": 196.578125, "epoch": 1.4215851602023608, "grad_norm": 21.973761932873924, "kl": 0.330078125, "learning_rate": 7.160202360876897e-07, "loss": 0.0003, "reward": 3.7035324573516846, "reward_std": 0.18692347779870033, "rewards/final_reward": 1.8463038185252576, "rewards/mask_iou_reward": 0.9231519092626288, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.7191572785377502, "rewards/thk_ans_format_reward": 1.0, "step": 842, "think_completion_length": 51.21875 }, { "clip_ratio": 0.0, "completion_length": 123.921875, "epoch": 1.4232715008431702, "grad_norm": 3.8783438591055797, "kl": 0.453125, "learning_rate": 7.156829679595277e-07, "loss": 0.0005, "reward": 3.3667051792144775, "reward_std": 0.13266583997756243, "rewards/final_reward": 1.1172658691646342, "rewards/mask_iou_reward": 0.5586329345823171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3667052388191223, "rewards/thk_ans_format_reward": 1.0, "step": 843, "think_completion_length": 54.15625 }, { "clip_ratio": 0.0, "completion_length": 134.5, "epoch": 1.4249578414839799, "grad_norm": 5.68908269467362, "kl": 0.4140625, "learning_rate": 7.153456998313659e-07, "loss": 0.0004, "reward": 3.704403042793274, "reward_std": 0.12302776426076889, "rewards/final_reward": 1.7305376594747914, "rewards/mask_iou_reward": 0.8652688297373957, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.704403042793274, "rewards/thk_ans_format_reward": 1.0, "step": 844, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 143.671875, "epoch": 1.4266441821247893, "grad_norm": 7.873233196363575, "kl": 0.4736328125, "learning_rate": 7.15008431703204e-07, "loss": 0.0005, "reward": 3.1094895601272583, "reward_std": 0.056312352418899536, "rewards/final_reward": 0.7561530607928131, "rewards/mask_iou_reward": 0.37807653039640654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1094896793365479, "rewards/thk_ans_format_reward": 1.0, "step": 845, "think_completion_length": 58.0 }, { "clip_ratio": 0.0, "completion_length": 146.953125, "epoch": 1.4283305227655987, "grad_norm": 5.862238379381449, "kl": 0.4033203125, "learning_rate": 7.146711635750421e-07, "loss": 0.0004, "reward": 2.8694413900375366, "reward_std": 0.26014316687360406, "rewards/final_reward": 0.23614428593740902, "rewards/mask_iou_reward": 0.11807214296870451, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8850663900375366, "rewards/thk_ans_format_reward": 1.0, "step": 846, "think_completion_length": 55.875 }, { "clip_ratio": 0.0, "completion_length": 121.125, "epoch": 1.430016863406408, "grad_norm": 97.83723017637423, "kl": 0.4453125, "learning_rate": 7.143338954468803e-07, "loss": 0.0005, "reward": 2.65997576713562, "reward_std": 0.33230482041835785, "rewards/final_reward": 0.5149375789828794, "rewards/mask_iou_reward": 0.2574687894914397, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6599756479263306, "rewards/thk_ans_format_reward": 1.0, "step": 847, "think_completion_length": 53.125 }, { "clip_ratio": 0.0, "completion_length": 125.6875, "epoch": 1.4317032040472175, "grad_norm": 15.2524038009595, "kl": 0.4248046875, "learning_rate": 7.139966273187184e-07, "loss": 0.0004, "reward": 3.162472367286682, "reward_std": 0.2069089524447918, "rewards/final_reward": 1.571867908834975, "rewards/mask_iou_reward": 0.7859339544174875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1624723076820374, "rewards/thk_ans_format_reward": 1.0, "step": 848, "think_completion_length": 49.78125 }, { "clip_ratio": 0.0, "completion_length": 124.0625, "epoch": 1.433389544688027, "grad_norm": 11.31711407452566, "kl": 0.509765625, "learning_rate": 7.136593591905564e-07, "loss": 0.0005, "reward": 3.353234648704529, "reward_std": 0.3007300794124603, "rewards/final_reward": 1.415233789323357, "rewards/mask_iou_reward": 0.7076168946616785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3532347679138184, "rewards/thk_ans_format_reward": 1.0, "step": 849, "think_completion_length": 58.09375 }, { "clip_ratio": 0.0, "completion_length": 123.5, "epoch": 1.4350758853288363, "grad_norm": 8.058228701945565, "kl": 0.4248046875, "learning_rate": 7.133220910623946e-07, "loss": 0.0005, "reward": 3.258637309074402, "reward_std": 0.028607182670384645, "rewards/final_reward": 1.4189250618857554, "rewards/mask_iou_reward": 0.7094625309428777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2586371302604675, "rewards/thk_ans_format_reward": 1.0, "step": 850, "think_completion_length": 51.75 }, { "clip_ratio": 0.0, "completion_length": 126.734375, "epoch": 1.436762225969646, "grad_norm": 9.547448920950147, "kl": 0.50390625, "learning_rate": 7.129848229342327e-07, "loss": 0.0005, "reward": 2.6284313201904297, "reward_std": 0.20265497267246246, "rewards/final_reward": 0.8259624311973379, "rewards/mask_iou_reward": 0.41298121559866896, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6284312754869461, "rewards/thk_ans_format_reward": 1.0, "step": 851, "think_completion_length": 60.28125 }, { "clip_ratio": 0.0, "completion_length": 139.6875, "epoch": 1.4384485666104554, "grad_norm": 13.224954231135428, "kl": 0.474609375, "learning_rate": 7.126475548060707e-07, "loss": 0.0005, "reward": 2.9994101524353027, "reward_std": 0.12460505217313766, "rewards/final_reward": 1.8945995775852573, "rewards/mask_iou_reward": 0.9472997887926287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9994100630283356, "rewards/thk_ans_format_reward": 1.0, "step": 852, "think_completion_length": 53.875 }, { "clip_ratio": 0.0, "completion_length": 123.40625, "epoch": 1.4401349072512648, "grad_norm": 8.569023489569883, "kl": 0.458984375, "learning_rate": 7.123102866779089e-07, "loss": 0.0005, "reward": 3.078709602355957, "reward_std": 0.11330131255090237, "rewards/final_reward": 0.8808447919914785, "rewards/mask_iou_reward": 0.44042239599573924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0787094831466675, "rewards/thk_ans_format_reward": 1.0, "step": 853, "think_completion_length": 48.15625 }, { "clip_ratio": 0.0, "completion_length": 145.21875, "epoch": 1.4418212478920742, "grad_norm": 4.642482646726498, "kl": 0.521484375, "learning_rate": 7.11973018549747e-07, "loss": 0.0005, "reward": 3.0897263288497925, "reward_std": 0.2790881544351578, "rewards/final_reward": 0.9708697595854819, "rewards/mask_iou_reward": 0.48543487979274097, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0897262394428253, "rewards/thk_ans_format_reward": 1.0, "step": 854, "think_completion_length": 48.90625 }, { "clip_ratio": 0.0, "completion_length": 135.75, "epoch": 1.4435075885328836, "grad_norm": 6.027895450430307, "kl": 0.41796875, "learning_rate": 7.116357504215851e-07, "loss": 0.0004, "reward": 3.2061294317245483, "reward_std": 0.1781318113207817, "rewards/final_reward": 0.8162860726709539, "rewards/mask_iou_reward": 0.40814303633547694, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.206129550933838, "rewards/thk_ans_format_reward": 1.0, "step": 855, "think_completion_length": 58.46875 }, { "clip_ratio": 0.0, "completion_length": 132.828125, "epoch": 1.445193929173693, "grad_norm": 4.937207974636539, "kl": 0.4892578125, "learning_rate": 7.112984822934233e-07, "loss": 0.0005, "reward": 3.273247718811035, "reward_std": 0.44166913628578186, "rewards/final_reward": 1.2405309436027023, "rewards/mask_iou_reward": 0.6202654718013512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2732477188110352, "rewards/thk_ans_format_reward": 1.0, "step": 856, "think_completion_length": 51.53125 }, { "clip_ratio": 0.0, "completion_length": 122.359375, "epoch": 1.4468802698145025, "grad_norm": 4.498629741082507, "kl": 0.447265625, "learning_rate": 7.109612141652614e-07, "loss": 0.0004, "reward": 3.0108615159988403, "reward_std": 0.3564150631427765, "rewards/final_reward": 0.9811074353066257, "rewards/mask_iou_reward": 0.49055371765331285, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.0421114563941956, "rewards/thk_ans_format_reward": 1.0, "step": 857, "think_completion_length": 49.8125 }, { "clip_ratio": 0.0, "completion_length": 130.9375, "epoch": 1.448566610455312, "grad_norm": 40.5484481050642, "kl": 0.4423828125, "learning_rate": 7.106239460370994e-07, "loss": 0.0004, "reward": 3.179835796356201, "reward_std": 0.16212757676839828, "rewards/final_reward": 1.2437849535160206, "rewards/mask_iou_reward": 0.6218924767580103, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1798357963562012, "rewards/thk_ans_format_reward": 1.0, "step": 858, "think_completion_length": 57.8125 }, { "clip_ratio": 0.0, "completion_length": 125.21875, "epoch": 1.4502529510961213, "grad_norm": 9.542311459129479, "kl": 0.501953125, "learning_rate": 7.102866779089376e-07, "loss": 0.0005, "reward": 3.281885504722595, "reward_std": 0.40248236060142517, "rewards/final_reward": 1.2012127911041488, "rewards/mask_iou_reward": 0.6006063955520744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2818855047225952, "rewards/thk_ans_format_reward": 1.0, "step": 859, "think_completion_length": 55.4375 }, { "clip_ratio": 0.0, "completion_length": 110.84375, "epoch": 1.451939291736931, "grad_norm": 4.3575825018615495, "kl": 0.4375, "learning_rate": 7.099494097807756e-07, "loss": 0.0004, "reward": 3.309635281562805, "reward_std": 0.275321364402771, "rewards/final_reward": 1.567287356990522, "rewards/mask_iou_reward": 0.783643678495261, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.309635192155838, "rewards/thk_ans_format_reward": 1.0, "step": 860, "think_completion_length": 49.40625 }, { "clip_ratio": 0.0, "completion_length": 130.5, "epoch": 1.4536256323777403, "grad_norm": 17.364544031528975, "kl": 0.4736328125, "learning_rate": 7.096121416526138e-07, "loss": 0.0005, "reward": 3.50772488117218, "reward_std": 0.06305067986249924, "rewards/final_reward": 1.5693529211308035, "rewards/mask_iou_reward": 0.7846764605654017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5077248811721802, "rewards/thk_ans_format_reward": 1.0, "step": 861, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 156.078125, "epoch": 1.4553119730185498, "grad_norm": 9.445189061831712, "kl": 0.431640625, "learning_rate": 7.092748735244519e-07, "loss": 0.0004, "reward": 3.5429933071136475, "reward_std": 0.17221157252788544, "rewards/final_reward": 1.6822896347621752, "rewards/mask_iou_reward": 0.8411448173810876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5429933071136475, "rewards/thk_ans_format_reward": 1.0, "step": 862, "think_completion_length": 53.9375 }, { "clip_ratio": 0.0, "completion_length": 122.875, "epoch": 1.4569983136593592, "grad_norm": 59.79920462584689, "kl": 0.44140625, "learning_rate": 7.0893760539629e-07, "loss": 0.0004, "reward": 3.3259243965148926, "reward_std": 0.2988605722784996, "rewards/final_reward": 1.1951013711153204, "rewards/mask_iou_reward": 0.5975506855576602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3259243965148926, "rewards/thk_ans_format_reward": 1.0, "step": 863, "think_completion_length": 53.375 }, { "clip_ratio": 0.0, "completion_length": 119.375, "epoch": 1.4586846543001686, "grad_norm": 10.79964206810661, "kl": 0.5322265625, "learning_rate": 7.086003372681282e-07, "loss": 0.0006, "reward": 2.977699041366577, "reward_std": 0.10583911696448922, "rewards/final_reward": 1.0417962258507818, "rewards/mask_iou_reward": 0.5208981129253909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.97769895195961, "rewards/thk_ans_format_reward": 1.0, "step": 864, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 119.59375, "epoch": 1.460370994940978, "grad_norm": 18.176945366133, "kl": 0.4658203125, "learning_rate": 7.082630691399663e-07, "loss": 0.0005, "reward": 3.491799473762512, "reward_std": 0.06794925779104233, "rewards/final_reward": 1.199618176053284, "rewards/mask_iou_reward": 0.599809088026642, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.491799533367157, "rewards/thk_ans_format_reward": 1.0, "step": 865, "think_completion_length": 46.3125 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 1.4620573355817874, "grad_norm": 19.21865624105213, "kl": 0.4833984375, "learning_rate": 7.079258010118043e-07, "loss": 0.0005, "reward": 3.5655884742736816, "reward_std": 0.22954148054122925, "rewards/final_reward": 1.7901023068675446, "rewards/mask_iou_reward": 0.8950511534337723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5655885338783264, "rewards/thk_ans_format_reward": 1.0, "step": 866, "think_completion_length": 52.125 }, { "clip_ratio": 0.0, "completion_length": 119.671875, "epoch": 1.463743676222597, "grad_norm": 11.208424920176157, "kl": 0.443359375, "learning_rate": 7.075885328836425e-07, "loss": 0.0004, "reward": 3.2776330709457397, "reward_std": 0.1639660745859146, "rewards/final_reward": 1.0371694517989962, "rewards/mask_iou_reward": 0.5185847258994981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2776331305503845, "rewards/thk_ans_format_reward": 1.0, "step": 867, "think_completion_length": 48.9375 }, { "clip_ratio": 0.0, "completion_length": 120.0625, "epoch": 1.4654300168634065, "grad_norm": 11.883223028850797, "kl": 0.4833984375, "learning_rate": 7.072512647554806e-07, "loss": 0.0005, "reward": 3.4130160808563232, "reward_std": 0.2615826725959778, "rewards/final_reward": 1.2130193291082727, "rewards/mask_iou_reward": 0.6065096645541364, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.413015902042389, "rewards/thk_ans_format_reward": 1.0, "step": 868, "think_completion_length": 54.1875 }, { "clip_ratio": 0.0, "completion_length": 126.265625, "epoch": 1.4671163575042159, "grad_norm": 32.80673354192309, "kl": 0.44140625, "learning_rate": 7.069139966273186e-07, "loss": 0.0004, "reward": 2.798851251602173, "reward_std": 0.08375886641442776, "rewards/final_reward": 0.5250274420052581, "rewards/mask_iou_reward": 0.26251372100262904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7988512814044952, "rewards/thk_ans_format_reward": 1.0, "step": 869, "think_completion_length": 54.46875 }, { "clip_ratio": 0.0, "completion_length": 152.3125, "epoch": 1.4688026981450253, "grad_norm": 13.261868917995077, "kl": 0.462890625, "learning_rate": 7.065767284991568e-07, "loss": 0.0005, "reward": 2.923264503479004, "reward_std": 0.25585998594760895, "rewards/final_reward": 0.6537531827199606, "rewards/mask_iou_reward": 0.3268765913599803, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9232644140720367, "rewards/thk_ans_format_reward": 1.0, "step": 870, "think_completion_length": 54.6875 }, { "clip_ratio": 0.0, "completion_length": 143.515625, "epoch": 1.4704890387858347, "grad_norm": 5.530281082751846, "kl": 0.4130859375, "learning_rate": 7.062394603709949e-07, "loss": 0.0004, "reward": 3.422648787498474, "reward_std": 0.2950022518634796, "rewards/final_reward": 1.0513954006982185, "rewards/mask_iou_reward": 0.5256977003491092, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.422648847103119, "rewards/thk_ans_format_reward": 1.0, "step": 871, "think_completion_length": 48.65625 }, { "clip_ratio": 0.0, "completion_length": 149.078125, "epoch": 1.4721753794266441, "grad_norm": 7.472701314041896, "kl": 0.4140625, "learning_rate": 7.05902192242833e-07, "loss": 0.0004, "reward": 3.1202709674835205, "reward_std": 0.0359388068318367, "rewards/final_reward": 0.9082897754423601, "rewards/mask_iou_reward": 0.45414488772118006, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1202709674835205, "rewards/thk_ans_format_reward": 1.0, "step": 872, "think_completion_length": 52.125 }, { "clip_ratio": 0.0, "completion_length": 126.453125, "epoch": 1.4738617200674535, "grad_norm": 9.384000312254551, "kl": 0.5283203125, "learning_rate": 7.055649241146712e-07, "loss": 0.0005, "reward": 3.0585156679153442, "reward_std": 0.1298949345946312, "rewards/final_reward": 1.1770550045989658, "rewards/mask_iou_reward": 0.5885275022994829, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0741406977176666, "rewards/thk_ans_format_reward": 0.984375, "step": 873, "think_completion_length": 50.09375 }, { "clip_ratio": 0.0, "completion_length": 127.359375, "epoch": 1.4755480607082632, "grad_norm": 10.702095709503649, "kl": 0.4658203125, "learning_rate": 7.052276559865092e-07, "loss": 0.0005, "reward": 3.082894802093506, "reward_std": 0.22249618917703629, "rewards/final_reward": 1.1370577026434994, "rewards/mask_iou_reward": 0.5685288513217497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0828947126865387, "rewards/thk_ans_format_reward": 1.0, "step": 874, "think_completion_length": 50.4375 }, { "clip_ratio": 0.0, "completion_length": 127.59375, "epoch": 1.4772344013490726, "grad_norm": 9.649217015576328, "kl": 0.443359375, "learning_rate": 7.048903878583473e-07, "loss": 0.0004, "reward": 2.738980293273926, "reward_std": 0.2116129845380783, "rewards/final_reward": 0.9144510817754059, "rewards/mask_iou_reward": 0.45722554088770295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7389805316925049, "rewards/thk_ans_format_reward": 1.0, "step": 875, "think_completion_length": 60.75 }, { "clip_ratio": 0.0, "completion_length": 196.75, "epoch": 1.478920741989882, "grad_norm": 6.049179427563422, "kl": 0.412109375, "learning_rate": 7.045531197301855e-07, "loss": 0.0004, "reward": 3.134316563606262, "reward_std": 0.2520030327141285, "rewards/final_reward": 1.0917850139519656, "rewards/mask_iou_reward": 0.5458925069759828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1499416530132294, "rewards/thk_ans_format_reward": 0.984375, "step": 876, "think_completion_length": 59.0 }, { "clip_ratio": 0.0, "completion_length": 124.703125, "epoch": 1.4806070826306914, "grad_norm": 6.690438754177215, "kl": 0.466796875, "learning_rate": 7.042158516020236e-07, "loss": 0.0005, "reward": 3.6443511247634888, "reward_std": 0.09915501996874809, "rewards/final_reward": 1.5526876263785594, "rewards/mask_iou_reward": 0.7763438131892797, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.644351065158844, "rewards/thk_ans_format_reward": 1.0, "step": 877, "think_completion_length": 56.53125 }, { "clip_ratio": 0.0, "completion_length": 123.34375, "epoch": 1.4822934232715008, "grad_norm": 13.592751851346678, "kl": 0.52734375, "learning_rate": 7.038785834738616e-07, "loss": 0.0005, "reward": 3.3902156352996826, "reward_std": 0.13225466385483742, "rewards/final_reward": 1.526574811986646, "rewards/mask_iou_reward": 0.763287405993323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.390215516090393, "rewards/thk_ans_format_reward": 1.0, "step": 878, "think_completion_length": 46.75 }, { "clip_ratio": 0.0, "completion_length": 129.296875, "epoch": 1.4839797639123102, "grad_norm": 5.0377686056084166, "kl": 0.4716796875, "learning_rate": 7.035413153456998e-07, "loss": 0.0005, "reward": 3.232161045074463, "reward_std": 0.12451484799385071, "rewards/final_reward": 0.8035639148908846, "rewards/mask_iou_reward": 0.4017819574454423, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2321611642837524, "rewards/thk_ans_format_reward": 1.0, "step": 879, "think_completion_length": 47.5625 }, { "clip_ratio": 0.0, "completion_length": 151.03125, "epoch": 1.4856661045531196, "grad_norm": 5.453689899823007, "kl": 0.42578125, "learning_rate": 7.032040472175379e-07, "loss": 0.0004, "reward": 3.3886868953704834, "reward_std": 0.15230849012732506, "rewards/final_reward": 1.1831722336546733, "rewards/mask_iou_reward": 0.5915861168273366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3886866569519043, "rewards/thk_ans_format_reward": 1.0, "step": 880, "think_completion_length": 52.09375 }, { "clip_ratio": 0.0, "completion_length": 136.71875, "epoch": 1.4873524451939293, "grad_norm": 10.350225069844578, "kl": 0.41015625, "learning_rate": 7.02866779089376e-07, "loss": 0.0004, "reward": 3.2247852087020874, "reward_std": 0.22761035338044167, "rewards/final_reward": 1.3909674394845886, "rewards/mask_iou_reward": 0.6954837197422943, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2247852683067322, "rewards/thk_ans_format_reward": 1.0, "step": 881, "think_completion_length": 56.78125 }, { "clip_ratio": 0.0, "completion_length": 122.328125, "epoch": 1.4890387858347387, "grad_norm": 4.851130451842982, "kl": 0.4716796875, "learning_rate": 7.025295109612142e-07, "loss": 0.0005, "reward": 3.4963871240615845, "reward_std": 0.19990779552608728, "rewards/final_reward": 1.3284091362959223, "rewards/mask_iou_reward": 0.6642045681479611, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4963870644569397, "rewards/thk_ans_format_reward": 1.0, "step": 882, "think_completion_length": 55.90625 }, { "clip_ratio": 0.0, "completion_length": 123.421875, "epoch": 1.4907251264755481, "grad_norm": 4.651181463731348, "kl": 0.482421875, "learning_rate": 7.021922428330522e-07, "loss": 0.0005, "reward": 3.3203155994415283, "reward_std": 0.15234145522117615, "rewards/final_reward": 0.845040981767172, "rewards/mask_iou_reward": 0.422520490883586, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.320315659046173, "rewards/thk_ans_format_reward": 1.0, "step": 883, "think_completion_length": 55.09375 }, { "clip_ratio": 0.0, "completion_length": 133.703125, "epoch": 1.4924114671163575, "grad_norm": 5.863150136793525, "kl": 0.44921875, "learning_rate": 7.018549747048903e-07, "loss": 0.0005, "reward": 3.320623278617859, "reward_std": 0.1672058179974556, "rewards/final_reward": 1.8739585309298803, "rewards/mask_iou_reward": 0.9369792654649401, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3206232190132141, "rewards/thk_ans_format_reward": 1.0, "step": 884, "think_completion_length": 53.96875 }, { "clip_ratio": 0.0, "completion_length": 129.515625, "epoch": 1.494097807757167, "grad_norm": 6.021223525582373, "kl": 0.5, "learning_rate": 7.015177065767285e-07, "loss": 0.0005, "reward": 3.579367160797119, "reward_std": 0.35837820172309875, "rewards/final_reward": 1.5228644932419437, "rewards/mask_iou_reward": 0.7614322466209719, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.594992220401764, "rewards/thk_ans_format_reward": 1.0, "step": 885, "think_completion_length": 61.0625 }, { "clip_ratio": 0.0, "completion_length": 159.984375, "epoch": 1.4957841483979764, "grad_norm": 6.5531915249452215, "kl": 0.3818359375, "learning_rate": 7.011804384485666e-07, "loss": 0.0004, "reward": 2.4116973876953125, "reward_std": 0.24532928317785263, "rewards/final_reward": 0.09217479847421674, "rewards/mask_iou_reward": 0.04608739923710837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.4116973280906677, "rewards/thk_ans_format_reward": 1.0, "step": 886, "think_completion_length": 53.46875 }, { "clip_ratio": 0.0, "completion_length": 129.21875, "epoch": 1.4974704890387858, "grad_norm": 22.897900079081385, "kl": 0.4697265625, "learning_rate": 7.008431703204047e-07, "loss": 0.0005, "reward": 3.2448573112487793, "reward_std": 0.3474937528371811, "rewards/final_reward": 1.0248634815796156, "rewards/mask_iou_reward": 0.5124317407898078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.244857370853424, "rewards/thk_ans_format_reward": 1.0, "step": 887, "think_completion_length": 58.3125 }, { "clip_ratio": 0.0, "completion_length": 130.375, "epoch": 1.4991568296795954, "grad_norm": 26.551364418368514, "kl": 0.439453125, "learning_rate": 7.005059021922428e-07, "loss": 0.0004, "reward": 3.463243246078491, "reward_std": 0.481712244451046, "rewards/final_reward": 1.6564740215537062, "rewards/mask_iou_reward": 0.8282370107768531, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4788681864738464, "rewards/thk_ans_format_reward": 1.0, "step": 888, "think_completion_length": 52.375 }, { "clip_ratio": 0.0, "completion_length": 126.375, "epoch": 1.5008431703204046, "grad_norm": 25.417332706180506, "kl": 0.470703125, "learning_rate": 7.001686340640809e-07, "loss": 0.0005, "reward": 2.934293508529663, "reward_std": 0.1236649677157402, "rewards/final_reward": 1.0390162157073441, "rewards/mask_iou_reward": 0.5195081078536721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9342935383319855, "rewards/thk_ans_format_reward": 1.0, "step": 889, "think_completion_length": 52.15625 }, { "clip_ratio": 0.0, "completion_length": 146.25, "epoch": 1.5025295109612142, "grad_norm": 7.780530546518989, "kl": 0.544921875, "learning_rate": 6.998313659359191e-07, "loss": 0.0005, "reward": 3.188772201538086, "reward_std": 0.2717321440577507, "rewards/final_reward": 0.9025040048437754, "rewards/mask_iou_reward": 0.4512520024218877, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.188772201538086, "rewards/thk_ans_format_reward": 1.0, "step": 890, "think_completion_length": 56.34375 }, { "clip_ratio": 0.0, "completion_length": 137.390625, "epoch": 1.5042158516020236, "grad_norm": 6.45209714621979, "kl": 0.44921875, "learning_rate": 6.994940978077571e-07, "loss": 0.0005, "reward": 3.103861451148987, "reward_std": 0.1689981073141098, "rewards/final_reward": 1.4753462516466826, "rewards/mask_iou_reward": 0.7376731258233413, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1038614511489868, "rewards/thk_ans_format_reward": 1.0, "step": 891, "think_completion_length": 45.875 }, { "clip_ratio": 0.0, "completion_length": 125.96875, "epoch": 1.505902192242833, "grad_norm": 28.14646763114755, "kl": 0.439453125, "learning_rate": 6.991568296795952e-07, "loss": 0.0004, "reward": 3.271444320678711, "reward_std": 0.038284238427877426, "rewards/final_reward": 1.3904845505095014, "rewards/mask_iou_reward": 0.6952422752547507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2714443802833557, "rewards/thk_ans_format_reward": 1.0, "step": 892, "think_completion_length": 52.75 }, { "clip_ratio": 0.0, "completion_length": 130.875, "epoch": 1.5075885328836425, "grad_norm": 7.348843640714404, "kl": 0.4296875, "learning_rate": 6.988195615514334e-07, "loss": 0.0004, "reward": 3.3650012016296387, "reward_std": 0.23928017914295197, "rewards/final_reward": 1.1050898513339247, "rewards/mask_iou_reward": 0.5525449256669623, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.365001142024994, "rewards/thk_ans_format_reward": 1.0, "step": 893, "think_completion_length": 62.09375 }, { "clip_ratio": 0.0, "completion_length": 128.6875, "epoch": 1.5092748735244519, "grad_norm": 58.76319880638057, "kl": 0.513671875, "learning_rate": 6.984822934232715e-07, "loss": 0.0005, "reward": 3.389597177505493, "reward_std": 0.2816731631755829, "rewards/final_reward": 1.569462222983449, "rewards/mask_iou_reward": 0.7847311114917245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3895971775054932, "rewards/thk_ans_format_reward": 1.0, "step": 894, "think_completion_length": 48.84375 }, { "clip_ratio": 0.0, "completion_length": 122.90625, "epoch": 1.5109612141652615, "grad_norm": 5.146659429828279, "kl": 0.46484375, "learning_rate": 6.981450252951096e-07, "loss": 0.0005, "reward": 3.0576419830322266, "reward_std": 0.19554530084133148, "rewards/final_reward": 0.757199370585173, "rewards/mask_iou_reward": 0.3785996852925865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0576421320438385, "rewards/thk_ans_format_reward": 1.0, "step": 895, "think_completion_length": 56.0 }, { "clip_ratio": 0.0, "completion_length": 129.5625, "epoch": 1.5126475548060707, "grad_norm": 8.009726459732972, "kl": 0.490234375, "learning_rate": 6.978077571669477e-07, "loss": 0.0005, "reward": 2.517244815826416, "reward_std": 0.1560732051730156, "rewards/final_reward": 0.7029032977709968, "rewards/mask_iou_reward": 0.3514516488854984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5172448828816414, "rewards/thk_ans_format_reward": 1.0, "step": 896, "think_completion_length": 53.78125 }, { "clip_ratio": 0.0, "completion_length": 136.25, "epoch": 1.5143338954468804, "grad_norm": 11.691308332853232, "kl": 0.439453125, "learning_rate": 6.974704890387858e-07, "loss": 0.0004, "reward": 2.8864370584487915, "reward_std": 0.3114437907934189, "rewards/final_reward": 1.1898409841779651, "rewards/mask_iou_reward": 0.5949204920889826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8864371180534363, "rewards/thk_ans_format_reward": 1.0, "step": 897, "think_completion_length": 54.09375 }, { "clip_ratio": 0.0, "completion_length": 126.828125, "epoch": 1.5160202360876898, "grad_norm": 20.22786026987595, "kl": 0.4736328125, "learning_rate": 6.971332209106239e-07, "loss": 0.0005, "reward": 3.25240159034729, "reward_std": 0.38694237172603607, "rewards/final_reward": 1.4252261662740588, "rewards/mask_iou_reward": 0.7126130831370294, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2680267095565796, "rewards/thk_ans_format_reward": 1.0, "step": 898, "think_completion_length": 54.46875 }, { "clip_ratio": 0.0, "completion_length": 131.09375, "epoch": 1.5177065767284992, "grad_norm": 4.799547084634126, "kl": 1.21484375, "learning_rate": 6.96795952782462e-07, "loss": 0.0012, "reward": 3.003826141357422, "reward_std": 0.3299378901720047, "rewards/final_reward": 0.8713669023703968, "rewards/mask_iou_reward": 0.4356834511851984, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0194511413574219, "rewards/thk_ans_format_reward": 1.0, "step": 899, "think_completion_length": 51.6875 }, { "clip_ratio": 0.0, "completion_length": 128.015625, "epoch": 1.5193929173693086, "grad_norm": 8.586444312018184, "kl": 0.447265625, "learning_rate": 6.964586846543001e-07, "loss": 0.0004, "reward": 3.177174210548401, "reward_std": 0.21209881751565263, "rewards/final_reward": 1.8933416244051449, "rewards/mask_iou_reward": 0.9466708122025724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1771742403507233, "rewards/thk_ans_format_reward": 1.0, "step": 900, "think_completion_length": 56.625 }, { "clip_ratio": 0.0, "completion_length": 138.890625, "epoch": 1.521079258010118, "grad_norm": 6.139072484754312, "kl": 0.421875, "learning_rate": 6.961214165261382e-07, "loss": 0.0004, "reward": 2.9001829624176025, "reward_std": 0.11913806945085526, "rewards/final_reward": 1.1471034366880408, "rewards/mask_iou_reward": 0.5735517183440204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9001829624176025, "rewards/thk_ans_format_reward": 1.0, "step": 901, "think_completion_length": 56.875 }, { "clip_ratio": 0.0, "completion_length": 128.984375, "epoch": 1.5227655986509276, "grad_norm": 38.17727674571188, "kl": 0.435546875, "learning_rate": 6.957841483979764e-07, "loss": 0.0004, "reward": 3.7425217628479004, "reward_std": 0.033854744397103786, "rewards/final_reward": 1.688420361475836, "rewards/mask_iou_reward": 0.844210180737918, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7425217628479004, "rewards/thk_ans_format_reward": 1.0, "step": 902, "think_completion_length": 59.03125 }, { "clip_ratio": 0.0, "completion_length": 143.6875, "epoch": 1.5244519392917368, "grad_norm": 17.620597318581506, "kl": 0.4287109375, "learning_rate": 6.954468802698145e-07, "loss": 0.0004, "reward": 2.8093347549438477, "reward_std": 0.08729386702179909, "rewards/final_reward": 0.4415490172857114, "rewards/mask_iou_reward": 0.2207745086428557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8093347251415253, "rewards/thk_ans_format_reward": 1.0, "step": 903, "think_completion_length": 62.90625 }, { "clip_ratio": 0.0, "completion_length": 128.90625, "epoch": 1.5261382799325465, "grad_norm": 81.3433205071691, "kl": 0.431640625, "learning_rate": 6.951096121416526e-07, "loss": 0.0004, "reward": 3.593350052833557, "reward_std": 0.22224682942032814, "rewards/final_reward": 1.5034894963403924, "rewards/mask_iou_reward": 0.7517447481701962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5933499336242676, "rewards/thk_ans_format_reward": 1.0, "step": 904, "think_completion_length": 56.0625 }, { "clip_ratio": 0.0, "completion_length": 165.3125, "epoch": 1.5278246205733557, "grad_norm": 4.356153558926586, "kl": 0.390625, "learning_rate": 6.947723440134907e-07, "loss": 0.0004, "reward": 3.0900958776474, "reward_std": 0.21000684797763824, "rewards/final_reward": 0.9526622565340548, "rewards/mask_iou_reward": 0.4763311282670274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0900957882404327, "rewards/thk_ans_format_reward": 1.0, "step": 905, "think_completion_length": 64.53125 }, { "clip_ratio": 0.0, "completion_length": 127.234375, "epoch": 1.5295109612141653, "grad_norm": 11.420126935243049, "kl": 0.41796875, "learning_rate": 6.944350758853288e-07, "loss": 0.0004, "reward": 3.141783356666565, "reward_std": 0.12532974779605865, "rewards/final_reward": 1.3057637626816818, "rewards/mask_iou_reward": 0.6528818813408409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.141783207654953, "rewards/thk_ans_format_reward": 1.0, "step": 906, "think_completion_length": 51.65625 }, { "clip_ratio": 0.0, "completion_length": 133.4375, "epoch": 1.5311973018549747, "grad_norm": 33.024581244226404, "kl": 0.443359375, "learning_rate": 6.940978077571668e-07, "loss": 0.0004, "reward": 3.2856587171554565, "reward_std": 0.23818902671337128, "rewards/final_reward": 1.5964721680485652, "rewards/mask_iou_reward": 0.7982360840242826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2856586575508118, "rewards/thk_ans_format_reward": 1.0, "step": 907, "think_completion_length": 59.71875 }, { "clip_ratio": 0.0, "completion_length": 127.546875, "epoch": 1.5328836424957841, "grad_norm": 4.441225018652143, "kl": 0.498046875, "learning_rate": 6.93760539629005e-07, "loss": 0.0005, "reward": 3.2125693559646606, "reward_std": 0.17255272343754768, "rewards/final_reward": 1.3438855575718396, "rewards/mask_iou_reward": 0.6719427787859198, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2125694155693054, "rewards/thk_ans_format_reward": 1.0, "step": 908, "think_completion_length": 58.59375 }, { "clip_ratio": 0.0, "completion_length": 156.078125, "epoch": 1.5345699831365935, "grad_norm": 6.618112377181082, "kl": 0.3896484375, "learning_rate": 6.934232715008431e-07, "loss": 0.0004, "reward": 3.194159507751465, "reward_std": 0.2769739478826523, "rewards/final_reward": 0.8722471206493585, "rewards/mask_iou_reward": 0.43612356032467925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1941594183444977, "rewards/thk_ans_format_reward": 1.0, "step": 909, "think_completion_length": 55.0 }, { "clip_ratio": 0.0, "completion_length": 158.296875, "epoch": 1.536256323777403, "grad_norm": 3.976682860182766, "kl": 0.431640625, "learning_rate": 6.930860033726813e-07, "loss": 0.0004, "reward": 3.5049896240234375, "reward_std": 0.09566706418991089, "rewards/final_reward": 1.4766909435750448, "rewards/mask_iou_reward": 0.7383454717875224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5049898028373718, "rewards/thk_ans_format_reward": 1.0, "step": 910, "think_completion_length": 55.15625 }, { "clip_ratio": 0.0, "completion_length": 131.65625, "epoch": 1.5379426644182126, "grad_norm": 8.674155382800526, "kl": 0.515625, "learning_rate": 6.927487352445194e-07, "loss": 0.0005, "reward": 3.68453848361969, "reward_std": 0.3740627020597458, "rewards/final_reward": 1.6059642488093178, "rewards/mask_iou_reward": 0.8029821244046589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6845386028289795, "rewards/thk_ans_format_reward": 1.0, "step": 911, "think_completion_length": 60.0 }, { "clip_ratio": 0.0, "completion_length": 125.828125, "epoch": 1.5396290050590218, "grad_norm": 4.201349564079745, "kl": 0.4345703125, "learning_rate": 6.924114671163575e-07, "loss": 0.0004, "reward": 2.7524369955062866, "reward_std": 0.06399728916585445, "rewards/final_reward": 0.05723391880944776, "rewards/mask_iou_reward": 0.02861695940472388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.752437025308609, "rewards/thk_ans_format_reward": 1.0, "step": 912, "think_completion_length": 57.25 }, { "clip_ratio": 0.0, "completion_length": 113.640625, "epoch": 1.5413153456998314, "grad_norm": 15.707962181930915, "kl": 0.482421875, "learning_rate": 6.920741989881957e-07, "loss": 0.0005, "reward": 3.2202765941619873, "reward_std": 0.2580166608095169, "rewards/final_reward": 1.5070204731055663, "rewards/mask_iou_reward": 0.7535102365527832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.220276415348053, "rewards/thk_ans_format_reward": 1.0, "step": 913, "think_completion_length": 52.65625 }, { "clip_ratio": 0.0, "completion_length": 129.234375, "epoch": 1.5430016863406408, "grad_norm": 4.612716388916718, "kl": 0.46484375, "learning_rate": 6.917369308600337e-07, "loss": 0.0005, "reward": 3.229029417037964, "reward_std": 0.2685174345970154, "rewards/final_reward": 1.2355635873006694, "rewards/mask_iou_reward": 0.6177817936503347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.229029357433319, "rewards/thk_ans_format_reward": 1.0, "step": 914, "think_completion_length": 61.96875 }, { "clip_ratio": 0.0, "completion_length": 151.75, "epoch": 1.5446880269814502, "grad_norm": 9.71609930718754, "kl": 0.412109375, "learning_rate": 6.913996627318718e-07, "loss": 0.0004, "reward": 3.4811172485351562, "reward_std": 0.18026774376630783, "rewards/final_reward": 1.3179288333104693, "rewards/mask_iou_reward": 0.6589644166552346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.481117308139801, "rewards/thk_ans_format_reward": 1.0, "step": 915, "think_completion_length": 58.9375 }, { "clip_ratio": 0.0, "completion_length": 178.484375, "epoch": 1.5463743676222597, "grad_norm": 6.062189232886983, "kl": 0.388671875, "learning_rate": 6.910623946037099e-07, "loss": 0.0004, "reward": 3.1265569925308228, "reward_std": 0.11232495307922363, "rewards/final_reward": 1.1716458595219756, "rewards/mask_iou_reward": 0.5858229297609878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1265568435192108, "rewards/thk_ans_format_reward": 1.0, "step": 916, "think_completion_length": 55.21875 }, { "clip_ratio": 0.0, "completion_length": 150.46875, "epoch": 1.548060708263069, "grad_norm": 29.322825812695466, "kl": 0.40625, "learning_rate": 6.90725126475548e-07, "loss": 0.0004, "reward": 3.158001184463501, "reward_std": 0.3152724876999855, "rewards/final_reward": 1.6397486316758947, "rewards/mask_iou_reward": 0.8198743158379473, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1580011546611786, "rewards/thk_ans_format_reward": 1.0, "step": 917, "think_completion_length": 63.6875 }, { "clip_ratio": 0.0, "completion_length": 136.15625, "epoch": 1.5497470489038787, "grad_norm": 4.637874584865913, "kl": 0.431640625, "learning_rate": 6.903878583473861e-07, "loss": 0.0004, "reward": 3.1512595415115356, "reward_std": 0.05409781076014042, "rewards/final_reward": 1.146352360593079, "rewards/mask_iou_reward": 0.5731761802965395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1512594819068909, "rewards/thk_ans_format_reward": 1.0, "step": 918, "think_completion_length": 59.03125 }, { "clip_ratio": 0.0, "completion_length": 129.0, "epoch": 1.551433389544688, "grad_norm": 7.788632543194693, "kl": 0.474609375, "learning_rate": 6.900505902192243e-07, "loss": 0.0005, "reward": 3.1258350610733032, "reward_std": 0.07057809643447399, "rewards/final_reward": 0.8358368847323898, "rewards/mask_iou_reward": 0.4179184423661949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1258350908756256, "rewards/thk_ans_format_reward": 1.0, "step": 919, "think_completion_length": 58.28125 }, { "clip_ratio": 0.0, "completion_length": 128.0625, "epoch": 1.5531197301854975, "grad_norm": 5.97941155669541, "kl": 0.4501953125, "learning_rate": 6.897133220910624e-07, "loss": 0.0004, "reward": 3.0851335525512695, "reward_std": 0.22871796786785126, "rewards/final_reward": 1.0473059909656413, "rewards/mask_iou_reward": 0.5236529954828206, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0851335227489471, "rewards/thk_ans_format_reward": 1.0, "step": 920, "think_completion_length": 55.21875 }, { "clip_ratio": 0.0, "completion_length": 136.640625, "epoch": 1.554806070826307, "grad_norm": 4.936714793425599, "kl": 0.4384765625, "learning_rate": 6.893760539629005e-07, "loss": 0.0004, "reward": 3.1050972938537598, "reward_std": 0.12275232374668121, "rewards/final_reward": 1.113718007247768, "rewards/mask_iou_reward": 0.556859003623884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1050972640514374, "rewards/thk_ans_format_reward": 1.0, "step": 921, "think_completion_length": 54.875 }, { "clip_ratio": 0.0, "completion_length": 144.25, "epoch": 1.5564924114671164, "grad_norm": 10.771346363486499, "kl": 0.443359375, "learning_rate": 6.890387858347387e-07, "loss": 0.0004, "reward": 3.220325231552124, "reward_std": 0.22513248771429062, "rewards/final_reward": 1.4726670164797664, "rewards/mask_iou_reward": 0.7363335082398832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.220325231552124, "rewards/thk_ans_format_reward": 1.0, "step": 922, "think_completion_length": 54.09375 }, { "clip_ratio": 0.0, "completion_length": 151.03125, "epoch": 1.5581787521079258, "grad_norm": 6.072745008400607, "kl": 0.4453125, "learning_rate": 6.887015177065767e-07, "loss": 0.0004, "reward": 2.69256329536438, "reward_std": 0.2358478605747223, "rewards/final_reward": 1.005379959545185, "rewards/mask_iou_reward": 0.5026899797725926, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.708188384771347, "rewards/thk_ans_format_reward": 1.0, "step": 923, "think_completion_length": 58.28125 }, { "clip_ratio": 0.0, "completion_length": 127.296875, "epoch": 1.5598650927487352, "grad_norm": 5.613134816077137, "kl": 0.462890625, "learning_rate": 6.883642495784147e-07, "loss": 0.0005, "reward": 3.6800777912139893, "reward_std": 0.07642281427979469, "rewards/final_reward": 1.8789947092503736, "rewards/mask_iou_reward": 0.9394973546251868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6800777912139893, "rewards/thk_ans_format_reward": 1.0, "step": 924, "think_completion_length": 60.125 }, { "clip_ratio": 0.0, "completion_length": 139.125, "epoch": 1.5615514333895448, "grad_norm": 5.487692228810072, "kl": 0.4228515625, "learning_rate": 6.880269814502529e-07, "loss": 0.0004, "reward": 2.868987202644348, "reward_std": 0.18398159742355347, "rewards/final_reward": 0.4337001279491935, "rewards/mask_iou_reward": 0.21685006397459675, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8689871728420258, "rewards/thk_ans_format_reward": 1.0, "step": 925, "think_completion_length": 63.03125 }, { "clip_ratio": 0.0, "completion_length": 135.59375, "epoch": 1.563237774030354, "grad_norm": 8.794735029230512, "kl": 0.453125, "learning_rate": 6.87689713322091e-07, "loss": 0.0005, "reward": 3.198965072631836, "reward_std": 0.21983014419674873, "rewards/final_reward": 1.1240214130545105, "rewards/mask_iou_reward": 0.5620107065272553, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1989650130271912, "rewards/thk_ans_format_reward": 1.0, "step": 926, "think_completion_length": 59.84375 }, { "clip_ratio": 0.0, "completion_length": 124.671875, "epoch": 1.5649241146711637, "grad_norm": 12.984976043343616, "kl": 0.521484375, "learning_rate": 6.873524451939291e-07, "loss": 0.0005, "reward": 3.4763259887695312, "reward_std": 0.15147725492715836, "rewards/final_reward": 1.8704005807661068, "rewards/mask_iou_reward": 0.9352002903830534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4763262271881104, "rewards/thk_ans_format_reward": 1.0, "step": 927, "think_completion_length": 54.125 }, { "clip_ratio": 0.0, "completion_length": 160.0, "epoch": 1.566610455311973, "grad_norm": 5.291031497794727, "kl": 0.470703125, "learning_rate": 6.870151770657673e-07, "loss": 0.0005, "reward": 3.0472246408462524, "reward_std": 0.371063232421875, "rewards/final_reward": 1.653832077696684, "rewards/mask_iou_reward": 0.826916038848342, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0472246408462524, "rewards/thk_ans_format_reward": 1.0, "step": 928, "think_completion_length": 50.34375 }, { "clip_ratio": 0.0, "completion_length": 129.71875, "epoch": 1.5682967959527825, "grad_norm": 5.172504836959009, "kl": 0.4326171875, "learning_rate": 6.866779089376054e-07, "loss": 0.0004, "reward": 2.9630849361419678, "reward_std": 0.03674542997032404, "rewards/final_reward": 0.9665219127822793, "rewards/mask_iou_reward": 0.48326095639113964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.963085075840354, "rewards/thk_ans_format_reward": 1.0, "step": 929, "think_completion_length": 55.03125 }, { "clip_ratio": 0.0, "completion_length": 137.78125, "epoch": 1.569983136593592, "grad_norm": 6.651356452309486, "kl": 0.4130859375, "learning_rate": 6.863406408094435e-07, "loss": 0.0004, "reward": 3.1770557165145874, "reward_std": 0.14116919820662588, "rewards/final_reward": 1.7948890625633755, "rewards/mask_iou_reward": 0.8974445312816878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1770557463169098, "rewards/thk_ans_format_reward": 1.0, "step": 930, "think_completion_length": 55.375 }, { "clip_ratio": 0.0, "completion_length": 140.4375, "epoch": 1.5716694772344013, "grad_norm": 8.669947318491714, "kl": 0.42578125, "learning_rate": 6.860033726812817e-07, "loss": 0.0004, "reward": 3.5581700801849365, "reward_std": 0.10219700261950493, "rewards/final_reward": 1.5715215638037199, "rewards/mask_iou_reward": 0.7857607819018599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5581701397895813, "rewards/thk_ans_format_reward": 1.0, "step": 931, "think_completion_length": 64.65625 }, { "clip_ratio": 0.0, "completion_length": 126.140625, "epoch": 1.573355817875211, "grad_norm": 16.21093382898676, "kl": 0.4111328125, "learning_rate": 6.856661045531196e-07, "loss": 0.0004, "reward": 2.934127926826477, "reward_std": 0.17660583928227425, "rewards/final_reward": 0.6131344617154153, "rewards/mask_iou_reward": 0.30656723085770765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9341279566287994, "rewards/thk_ans_format_reward": 1.0, "step": 932, "think_completion_length": 56.5 }, { "clip_ratio": 0.0, "completion_length": 124.671875, "epoch": 1.5750421585160201, "grad_norm": 17.569316127594202, "kl": 0.4677734375, "learning_rate": 6.853288364249577e-07, "loss": 0.0005, "reward": 3.4104976654052734, "reward_std": 0.17937590926885605, "rewards/final_reward": 1.079353207141879, "rewards/mask_iou_reward": 0.5396766035709395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4104976654052734, "rewards/thk_ans_format_reward": 1.0, "step": 933, "think_completion_length": 56.84375 }, { "clip_ratio": 0.0, "completion_length": 134.46875, "epoch": 1.5767284991568298, "grad_norm": 36.84957525298316, "kl": 0.4111328125, "learning_rate": 6.849915682967959e-07, "loss": 0.0004, "reward": 3.0988067388534546, "reward_std": 0.084061773493886, "rewards/final_reward": 1.296683767640328, "rewards/mask_iou_reward": 0.648341883820164, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0988066494464874, "rewards/thk_ans_format_reward": 1.0, "step": 934, "think_completion_length": 54.125 }, { "clip_ratio": 0.0, "completion_length": 136.15625, "epoch": 1.578414839797639, "grad_norm": 7.168885477010204, "kl": 0.41796875, "learning_rate": 6.84654300168634e-07, "loss": 0.0004, "reward": 2.9668532609939575, "reward_std": 0.15460747107863426, "rewards/final_reward": 0.9098481944969506, "rewards/mask_iou_reward": 0.4549240972484753, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9668533802032471, "rewards/thk_ans_format_reward": 1.0, "step": 935, "think_completion_length": 54.78125 }, { "clip_ratio": 0.0, "completion_length": 144.3125, "epoch": 1.5801011804384486, "grad_norm": 10.38616655853377, "kl": 0.41015625, "learning_rate": 6.843170320404722e-07, "loss": 0.0004, "reward": 3.1787116527557373, "reward_std": 0.22786857932806015, "rewards/final_reward": 1.3526219058215783, "rewards/mask_iou_reward": 0.6763109529107891, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.178711622953415, "rewards/thk_ans_format_reward": 1.0, "step": 936, "think_completion_length": 58.6875 }, { "clip_ratio": 0.0, "completion_length": 125.3125, "epoch": 1.581787521079258, "grad_norm": 15.467819146162833, "kl": 0.462890625, "learning_rate": 6.839797639123103e-07, "loss": 0.0005, "reward": 2.5192718505859375, "reward_std": 0.08620522171258926, "rewards/final_reward": 0.8012095167810038, "rewards/mask_iou_reward": 0.4006047583905019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5192718356847763, "rewards/thk_ans_format_reward": 1.0, "step": 937, "think_completion_length": 56.625 }, { "clip_ratio": 0.0, "completion_length": 121.703125, "epoch": 1.5834738617200674, "grad_norm": 6.545789979099022, "kl": 0.51953125, "learning_rate": 6.836424957841484e-07, "loss": 0.0005, "reward": 3.624456763267517, "reward_std": 0.041466801427304745, "rewards/final_reward": 1.7924130592935685, "rewards/mask_iou_reward": 0.8962065296467843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6244568228721619, "rewards/thk_ans_format_reward": 1.0, "step": 938, "think_completion_length": 50.0 }, { "clip_ratio": 0.0, "completion_length": 126.296875, "epoch": 1.5851602023608768, "grad_norm": 4.387297016647714, "kl": 0.421875, "learning_rate": 6.833052276559866e-07, "loss": 0.0004, "reward": 3.1855965852737427, "reward_std": 0.06531479209661484, "rewards/final_reward": 1.4869811079802981, "rewards/mask_iou_reward": 0.7434905539901491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.185596525669098, "rewards/thk_ans_format_reward": 1.0, "step": 939, "think_completion_length": 52.6875 }, { "clip_ratio": 0.0, "completion_length": 124.671875, "epoch": 1.5868465430016863, "grad_norm": 9.437264321195768, "kl": 0.40625, "learning_rate": 6.829679595278247e-07, "loss": 0.0004, "reward": 3.1889456510543823, "reward_std": 0.3177672028541565, "rewards/final_reward": 1.5420257945976152, "rewards/mask_iou_reward": 0.7710128972988076, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2045705914497375, "rewards/thk_ans_format_reward": 1.0, "step": 940, "think_completion_length": 54.09375 }, { "clip_ratio": 0.0, "completion_length": 126.640625, "epoch": 1.588532883642496, "grad_norm": 6.655029831675907, "kl": 0.439453125, "learning_rate": 6.826306913996626e-07, "loss": 0.0004, "reward": 2.6530632972717285, "reward_std": 0.3725260943174362, "rewards/final_reward": 0.7438548296706913, "rewards/mask_iou_reward": 0.37192741483534564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6530634164810181, "rewards/thk_ans_format_reward": 1.0, "step": 941, "think_completion_length": 59.84375 }, { "clip_ratio": 0.0, "completion_length": 129.421875, "epoch": 1.590219224283305, "grad_norm": 22.97720761020874, "kl": 0.4091796875, "learning_rate": 6.822934232715008e-07, "loss": 0.0004, "reward": 2.9522364139556885, "reward_std": 0.23722931742668152, "rewards/final_reward": 0.7601074556303956, "rewards/mask_iou_reward": 0.3800537278151978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9522364139556885, "rewards/thk_ans_format_reward": 1.0, "step": 942, "think_completion_length": 54.53125 }, { "clip_ratio": 0.0, "completion_length": 125.078125, "epoch": 1.5919055649241147, "grad_norm": 4.557053886602248, "kl": 0.4296875, "learning_rate": 6.819561551433389e-07, "loss": 0.0004, "reward": 3.2444454431533813, "reward_std": 0.09333648579195142, "rewards/final_reward": 0.6803116547975444, "rewards/mask_iou_reward": 0.3401558273987722, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2444454729557037, "rewards/thk_ans_format_reward": 1.0, "step": 943, "think_completion_length": 53.75 }, { "clip_ratio": 0.0, "completion_length": 152.265625, "epoch": 1.5935919055649241, "grad_norm": 6.630359254295777, "kl": 0.4296875, "learning_rate": 6.81618887015177e-07, "loss": 0.0004, "reward": 3.813447594642639, "reward_std": 0.13165267184376717, "rewards/final_reward": 1.8897588433100132, "rewards/mask_iou_reward": 0.9448794216550066, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.813447654247284, "rewards/thk_ans_format_reward": 1.0, "step": 944, "think_completion_length": 54.40625 }, { "clip_ratio": 0.0, "completion_length": 165.65625, "epoch": 1.5952782462057336, "grad_norm": 6.296176030816266, "kl": 0.40625, "learning_rate": 6.812816188870152e-07, "loss": 0.0004, "reward": 3.185824394226074, "reward_std": 0.1516023352742195, "rewards/final_reward": 0.5434738576778565, "rewards/mask_iou_reward": 0.27173692883892825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1858242750167847, "rewards/thk_ans_format_reward": 1.0, "step": 945, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 111.96875, "epoch": 1.596964586846543, "grad_norm": 7.362326699822068, "kl": 0.50390625, "learning_rate": 6.809443507588533e-07, "loss": 0.0005, "reward": 3.2611594200134277, "reward_std": 0.16684667952358723, "rewards/final_reward": 0.858870452074543, "rewards/mask_iou_reward": 0.4294352260372715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2611593306064606, "rewards/thk_ans_format_reward": 1.0, "step": 946, "think_completion_length": 53.03125 }, { "clip_ratio": 0.0, "completion_length": 134.78125, "epoch": 1.5986509274873524, "grad_norm": 10.544323839047761, "kl": 0.4716796875, "learning_rate": 6.806070826306914e-07, "loss": 0.0005, "reward": 2.8881239891052246, "reward_std": 0.09691545739769936, "rewards/final_reward": 0.4005539385653958, "rewards/mask_iou_reward": 0.2002769692826979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8881239891052246, "rewards/thk_ans_format_reward": 1.0, "step": 947, "think_completion_length": 50.4375 }, { "clip_ratio": 0.0, "completion_length": 154.265625, "epoch": 1.600337268128162, "grad_norm": 6.359182683112394, "kl": 0.3974609375, "learning_rate": 6.802698145025296e-07, "loss": 0.0004, "reward": 3.300767660140991, "reward_std": 0.21483464539051056, "rewards/final_reward": 1.1850023478592198, "rewards/mask_iou_reward": 0.5925011739296099, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3007675409317017, "rewards/thk_ans_format_reward": 1.0, "step": 948, "think_completion_length": 48.34375 }, { "clip_ratio": 0.0, "completion_length": 151.21875, "epoch": 1.6020236087689712, "grad_norm": 5.49566363170077, "kl": 0.41796875, "learning_rate": 6.799325463743675e-07, "loss": 0.0004, "reward": 3.170168161392212, "reward_std": 0.19993770122528076, "rewards/final_reward": 1.6430170837061473, "rewards/mask_iou_reward": 0.8215085418530736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.170168161392212, "rewards/thk_ans_format_reward": 1.0, "step": 949, "think_completion_length": 49.9375 }, { "clip_ratio": 0.0, "completion_length": 127.75, "epoch": 1.6037099494097808, "grad_norm": 13.203439988715884, "kl": 0.505859375, "learning_rate": 6.795952782462056e-07, "loss": 0.0005, "reward": 2.7104690074920654, "reward_std": 0.38839419186115265, "rewards/final_reward": 0.6529316778986988, "rewards/mask_iou_reward": 0.3264658389493494, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.7260940372943878, "rewards/thk_ans_format_reward": 1.0, "step": 950, "think_completion_length": 53.90625 }, { "clip_ratio": 0.0, "completion_length": 140.09375, "epoch": 1.6053962900505903, "grad_norm": 13.01189056246978, "kl": 0.4462890625, "learning_rate": 6.792580101180438e-07, "loss": 0.0004, "reward": 2.8600813150405884, "reward_std": 0.17967913672327995, "rewards/final_reward": 1.1577608656838663, "rewards/mask_iou_reward": 0.5788804328419331, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8913313150405884, "rewards/thk_ans_format_reward": 0.984375, "step": 951, "think_completion_length": 53.90625 }, { "clip_ratio": 0.0, "completion_length": 135.03125, "epoch": 1.6070826306913997, "grad_norm": 5.747511025787855, "kl": 0.49609375, "learning_rate": 6.789207419898819e-07, "loss": 0.0005, "reward": 3.1585750579833984, "reward_std": 0.23563334345817566, "rewards/final_reward": 0.6743248054644104, "rewards/mask_iou_reward": 0.3371624027322052, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1585749387741089, "rewards/thk_ans_format_reward": 1.0, "step": 952, "think_completion_length": 53.375 }, { "clip_ratio": 0.0, "completion_length": 127.78125, "epoch": 1.608768971332209, "grad_norm": 8.223043903697763, "kl": 0.50390625, "learning_rate": 6.7858347386172e-07, "loss": 0.0005, "reward": 3.6050385236740112, "reward_std": 0.14627795293927193, "rewards/final_reward": 1.7853409096681372, "rewards/mask_iou_reward": 0.8926704548340686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6050386428833008, "rewards/thk_ans_format_reward": 1.0, "step": 953, "think_completion_length": 60.78125 }, { "clip_ratio": 0.0, "completion_length": 154.234375, "epoch": 1.6104553119730185, "grad_norm": 7.414075157118034, "kl": 0.4140625, "learning_rate": 6.782462057335582e-07, "loss": 0.0004, "reward": 2.9460577964782715, "reward_std": 0.1564902514219284, "rewards/final_reward": 0.6832645173258263, "rewards/mask_iou_reward": 0.34163225866291314, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9460577964782715, "rewards/thk_ans_format_reward": 1.0, "step": 954, "think_completion_length": 63.09375 }, { "clip_ratio": 0.0, "completion_length": 125.5625, "epoch": 1.6121416526138281, "grad_norm": 11.447181386646438, "kl": 0.49609375, "learning_rate": 6.779089376053963e-07, "loss": 0.0005, "reward": 3.4335960149765015, "reward_std": 0.270451620221138, "rewards/final_reward": 1.47075722453424, "rewards/mask_iou_reward": 0.73537861226712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.433595895767212, "rewards/thk_ans_format_reward": 1.0, "step": 955, "think_completion_length": 61.25 }, { "clip_ratio": 0.0, "completion_length": 132.921875, "epoch": 1.6138279932546373, "grad_norm": 18.76508612601952, "kl": 0.44921875, "learning_rate": 6.775716694772344e-07, "loss": 0.0004, "reward": 3.172788619995117, "reward_std": 0.12457035994157195, "rewards/final_reward": 0.7677162580570118, "rewards/mask_iou_reward": 0.3838581290285059, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.172788679599762, "rewards/thk_ans_format_reward": 1.0, "step": 956, "think_completion_length": 50.375 }, { "clip_ratio": 0.0, "completion_length": 135.9375, "epoch": 1.615514333895447, "grad_norm": 7.851729484504201, "kl": 0.4345703125, "learning_rate": 6.772344013490725e-07, "loss": 0.0004, "reward": 2.9846887588500977, "reward_std": 0.2687959522008896, "rewards/final_reward": 0.6911135867581653, "rewards/mask_iou_reward": 0.3455567933790826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9846886992454529, "rewards/thk_ans_format_reward": 1.0, "step": 957, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 168.25, "epoch": 1.6172006745362564, "grad_norm": 5.848554755465288, "kl": 0.439453125, "learning_rate": 6.768971332209105e-07, "loss": 0.0004, "reward": 3.0769588947296143, "reward_std": 0.11406697146594524, "rewards/final_reward": 1.6693445781968697, "rewards/mask_iou_reward": 0.8346722890984348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0769589841365814, "rewards/thk_ans_format_reward": 1.0, "step": 958, "think_completion_length": 49.84375 }, { "clip_ratio": 0.0, "completion_length": 140.765625, "epoch": 1.6188870151770658, "grad_norm": 4.807242982976459, "kl": 0.4140625, "learning_rate": 6.765598650927486e-07, "loss": 0.0004, "reward": 3.054129123687744, "reward_std": 0.19870612863451242, "rewards/final_reward": 1.0483865036756663, "rewards/mask_iou_reward": 0.5241932518378332, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0697540044784546, "rewards/thk_ans_format_reward": 1.0, "step": 959, "think_completion_length": 52.90625 }, { "clip_ratio": 0.0, "completion_length": 122.796875, "epoch": 1.6205733558178752, "grad_norm": 4.898922143259208, "kl": 0.423828125, "learning_rate": 6.762225969645868e-07, "loss": 0.0004, "reward": 3.6588209867477417, "reward_std": 0.04261109419167042, "rewards/final_reward": 1.5577609229973175, "rewards/mask_iou_reward": 0.7788804614986587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6588210463523865, "rewards/thk_ans_format_reward": 1.0, "step": 960, "think_completion_length": 50.5625 }, { "clip_ratio": 0.0, "completion_length": 125.625, "epoch": 1.6222596964586846, "grad_norm": 14.867085837269215, "kl": 0.474609375, "learning_rate": 6.758853288364249e-07, "loss": 0.0005, "reward": 3.124504804611206, "reward_std": 0.12859491258859634, "rewards/final_reward": 0.8346335177809561, "rewards/mask_iou_reward": 0.41731675889047803, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1245048642158508, "rewards/thk_ans_format_reward": 1.0, "step": 961, "think_completion_length": 52.75 }, { "clip_ratio": 0.0, "completion_length": 138.265625, "epoch": 1.6239460370994943, "grad_norm": 5.2997309744790035, "kl": 0.4267578125, "learning_rate": 6.755480607082631e-07, "loss": 0.0004, "reward": 3.4570316076278687, "reward_std": 0.05912900622934103, "rewards/final_reward": 1.0982445093578157, "rewards/mask_iou_reward": 0.5491222546789079, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.457031488418579, "rewards/thk_ans_format_reward": 1.0, "step": 962, "think_completion_length": 54.8125 }, { "clip_ratio": 0.0, "completion_length": 124.171875, "epoch": 1.6256323777403034, "grad_norm": 5.236334173922632, "kl": 0.4912109375, "learning_rate": 6.752107925801012e-07, "loss": 0.0003, "reward": 3.1656047105789185, "reward_std": 0.09970117919147015, "rewards/final_reward": 0.9881815716090991, "rewards/mask_iou_reward": 0.49409078580454957, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1656047105789185, "rewards/thk_ans_format_reward": 1.0, "step": 963, "think_completion_length": 53.40625 }, { "clip_ratio": 0.0, "completion_length": 143.328125, "epoch": 1.627318718381113, "grad_norm": 8.80017290037366, "kl": 0.4306640625, "learning_rate": 6.748735244519393e-07, "loss": 0.0004, "reward": 3.161680221557617, "reward_std": 0.1751057505607605, "rewards/final_reward": 1.6679947373879087, "rewards/mask_iou_reward": 0.8339973686939544, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1773052215576172, "rewards/thk_ans_format_reward": 1.0, "step": 964, "think_completion_length": 57.03125 }, { "clip_ratio": 0.0, "completion_length": 128.21875, "epoch": 1.6290050590219223, "grad_norm": 9.93252740921444, "kl": 0.458984375, "learning_rate": 6.745362563237775e-07, "loss": 0.0005, "reward": 3.409914016723633, "reward_std": 0.31119733303785324, "rewards/final_reward": 1.532120199761723, "rewards/mask_iou_reward": 0.7660600998808615, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4255390763282776, "rewards/thk_ans_format_reward": 1.0, "step": 965, "think_completion_length": 52.96875 }, { "clip_ratio": 0.0, "completion_length": 164.234375, "epoch": 1.630691399662732, "grad_norm": 16.77150015777779, "kl": 0.55859375, "learning_rate": 6.741989881956155e-07, "loss": 0.0006, "reward": 3.312274217605591, "reward_std": 0.14685458689928055, "rewards/final_reward": 1.7361032842907864, "rewards/mask_iou_reward": 0.8680516421453932, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3122743964195251, "rewards/thk_ans_format_reward": 1.0, "step": 966, "think_completion_length": 53.21875 }, { "clip_ratio": 0.0, "completion_length": 142.09375, "epoch": 1.6323777403035413, "grad_norm": 26.952546856783577, "kl": 0.443359375, "learning_rate": 6.738617200674535e-07, "loss": 0.0004, "reward": 3.182400941848755, "reward_std": 0.2391754314303398, "rewards/final_reward": 1.218758984763645, "rewards/mask_iou_reward": 0.6093794923818225, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1824010014533997, "rewards/thk_ans_format_reward": 1.0, "step": 967, "think_completion_length": 57.0 }, { "clip_ratio": 0.0, "completion_length": 129.859375, "epoch": 1.6340640809443507, "grad_norm": 10.164153327570757, "kl": 0.45703125, "learning_rate": 6.735244519392917e-07, "loss": 0.0005, "reward": 3.2615323066711426, "reward_std": 0.22615301050245762, "rewards/final_reward": 1.009834806795975, "rewards/mask_iou_reward": 0.5049174033979875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2615323066711426, "rewards/thk_ans_format_reward": 1.0, "step": 968, "think_completion_length": 62.625 }, { "clip_ratio": 0.0, "completion_length": 137.25, "epoch": 1.6357504215851602, "grad_norm": 5.7572682275727995, "kl": 0.46875, "learning_rate": 6.731871838111298e-07, "loss": 0.0005, "reward": 3.587040066719055, "reward_std": 0.24026421457529068, "rewards/final_reward": 1.3034248229880045, "rewards/mask_iou_reward": 0.6517124114940023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5870400071144104, "rewards/thk_ans_format_reward": 1.0, "step": 969, "think_completion_length": 52.78125 }, { "clip_ratio": 0.0, "completion_length": 124.4375, "epoch": 1.6374367622259696, "grad_norm": 8.103092577365828, "kl": 0.470703125, "learning_rate": 6.728499156829679e-07, "loss": 0.0005, "reward": 2.8985953330993652, "reward_std": 0.3215479403734207, "rewards/final_reward": 0.6277226580931698, "rewards/mask_iou_reward": 0.3138613290465849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8985952436923981, "rewards/thk_ans_format_reward": 1.0, "step": 970, "think_completion_length": 56.46875 }, { "clip_ratio": 0.0, "completion_length": 117.84375, "epoch": 1.6391231028667792, "grad_norm": 7.764069334142484, "kl": 0.537109375, "learning_rate": 6.725126475548061e-07, "loss": 0.0005, "reward": 3.494862914085388, "reward_std": 0.044202063232660294, "rewards/final_reward": 1.3648452716227641, "rewards/mask_iou_reward": 0.6824226358113821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4948627352714539, "rewards/thk_ans_format_reward": 1.0, "step": 971, "think_completion_length": 50.25 }, { "clip_ratio": 0.0, "completion_length": 152.75, "epoch": 1.6408094435075884, "grad_norm": 7.854660442406088, "kl": 0.525390625, "learning_rate": 6.721753794266442e-07, "loss": 0.0005, "reward": 2.56212317943573, "reward_std": 0.1159486398100853, "rewards/final_reward": 1.0332222703548855, "rewards/mask_iou_reward": 0.5166111351774427, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5621231943368912, "rewards/thk_ans_format_reward": 1.0, "step": 972, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 146.984375, "epoch": 1.642495784148398, "grad_norm": 5.779886052799433, "kl": 0.4365234375, "learning_rate": 6.718381112984823e-07, "loss": 0.0004, "reward": 3.4496811628341675, "reward_std": 0.07857851311564445, "rewards/final_reward": 1.3060323556255151, "rewards/mask_iou_reward": 0.6530161778127576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4496811628341675, "rewards/thk_ans_format_reward": 1.0, "step": 973, "think_completion_length": 52.0625 }, { "clip_ratio": 0.0, "completion_length": 142.0, "epoch": 1.6441821247892074, "grad_norm": 197.52017044670364, "kl": 0.4443359375, "learning_rate": 6.715008431703204e-07, "loss": 0.0004, "reward": 3.357508063316345, "reward_std": 0.13863565400242805, "rewards/final_reward": 1.5934704967616962, "rewards/mask_iou_reward": 0.7967352483808481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3575079441070557, "rewards/thk_ans_format_reward": 1.0, "step": 974, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 132.296875, "epoch": 1.6458684654300169, "grad_norm": 13.268391687877173, "kl": 0.552734375, "learning_rate": 6.711635750421585e-07, "loss": 0.0006, "reward": 3.3350095748901367, "reward_std": 0.19416646659374237, "rewards/final_reward": 1.1799294280478632, "rewards/mask_iou_reward": 0.5899647140239316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.335009515285492, "rewards/thk_ans_format_reward": 1.0, "step": 975, "think_completion_length": 60.96875 }, { "clip_ratio": 0.0, "completion_length": 123.671875, "epoch": 1.6475548060708263, "grad_norm": 5.910386625262815, "kl": 0.43359375, "learning_rate": 6.708263069139965e-07, "loss": 0.0004, "reward": 3.4555543661117554, "reward_std": 0.04451208934187889, "rewards/final_reward": 1.316332445803982, "rewards/mask_iou_reward": 0.658166222901991, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4555545449256897, "rewards/thk_ans_format_reward": 1.0, "step": 976, "think_completion_length": 53.3125 }, { "clip_ratio": 0.0, "completion_length": 156.015625, "epoch": 1.6492411467116357, "grad_norm": 5.538920499109146, "kl": 0.46875, "learning_rate": 6.704890387858347e-07, "loss": 0.0005, "reward": 2.931230068206787, "reward_std": 0.12052519991993904, "rewards/final_reward": 0.8966643714602006, "rewards/mask_iou_reward": 0.4483321857301003, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9468550682067871, "rewards/thk_ans_format_reward": 1.0, "step": 977, "think_completion_length": 60.28125 }, { "clip_ratio": 0.0, "completion_length": 113.921875, "epoch": 1.6509274873524453, "grad_norm": 7.054514546876495, "kl": 0.470703125, "learning_rate": 6.701517706576728e-07, "loss": 0.0005, "reward": 3.2970367670059204, "reward_std": 0.3092042412608862, "rewards/final_reward": 1.645884152817545, "rewards/mask_iou_reward": 0.8229420764087725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2970367670059204, "rewards/thk_ans_format_reward": 1.0, "step": 978, "think_completion_length": 54.5625 }, { "clip_ratio": 0.0, "completion_length": 123.546875, "epoch": 1.6526138279932545, "grad_norm": 8.006301481962957, "kl": 0.4638671875, "learning_rate": 6.698145025295109e-07, "loss": 0.0005, "reward": 3.2908164262771606, "reward_std": 0.1687500774860382, "rewards/final_reward": 0.8286613290487709, "rewards/mask_iou_reward": 0.41433066452438544, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2908163666725159, "rewards/thk_ans_format_reward": 1.0, "step": 979, "think_completion_length": 51.1875 }, { "clip_ratio": 0.0, "completion_length": 122.953125, "epoch": 1.6543001686340641, "grad_norm": 3.8420590887205512, "kl": 0.458984375, "learning_rate": 6.694772344013491e-07, "loss": 0.0005, "reward": 3.4131717681884766, "reward_std": 0.06533291470259428, "rewards/final_reward": 1.1531764423339341, "rewards/mask_iou_reward": 0.5765882211669671, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4131719470024109, "rewards/thk_ans_format_reward": 1.0, "step": 980, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 123.53125, "epoch": 1.6559865092748736, "grad_norm": 5.036403987186729, "kl": 0.48828125, "learning_rate": 6.691399662731872e-07, "loss": 0.0005, "reward": 3.548070192337036, "reward_std": 0.2200283706188202, "rewards/final_reward": 1.390233837634983, "rewards/mask_iou_reward": 0.6951169188174915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5480701923370361, "rewards/thk_ans_format_reward": 1.0, "step": 981, "think_completion_length": 54.09375 }, { "clip_ratio": 0.0, "completion_length": 191.109375, "epoch": 1.657672849915683, "grad_norm": 7.029428131198568, "kl": 0.412109375, "learning_rate": 6.688026981450252e-07, "loss": 0.0004, "reward": 3.329420566558838, "reward_std": 0.08415714651346207, "rewards/final_reward": 1.756476899069994, "rewards/mask_iou_reward": 0.878238449534997, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3294204473495483, "rewards/thk_ans_format_reward": 1.0, "step": 982, "think_completion_length": 50.40625 }, { "clip_ratio": 0.0, "completion_length": 120.125, "epoch": 1.6593591905564924, "grad_norm": 8.335193513559757, "kl": 0.5380859375, "learning_rate": 6.684654300168634e-07, "loss": 0.0005, "reward": 3.044628858566284, "reward_std": 0.12631105724722147, "rewards/final_reward": 1.3542042297420498, "rewards/mask_iou_reward": 0.6771021148710249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0446289479732513, "rewards/thk_ans_format_reward": 1.0, "step": 983, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 141.734375, "epoch": 1.6610455311973018, "grad_norm": 5.917140641040661, "kl": 0.4580078125, "learning_rate": 6.681281618887014e-07, "loss": 0.0005, "reward": 3.469908595085144, "reward_std": 0.2401201520115137, "rewards/final_reward": 1.538840379611916, "rewards/mask_iou_reward": 0.769420189805958, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.485533595085144, "rewards/thk_ans_format_reward": 1.0, "step": 984, "think_completion_length": 55.6875 }, { "clip_ratio": 0.0, "completion_length": 147.421875, "epoch": 1.6627318718381114, "grad_norm": 12.243138457321537, "kl": 0.455078125, "learning_rate": 6.677908937605396e-07, "loss": 0.0005, "reward": 2.940201163291931, "reward_std": 0.31569964066147804, "rewards/final_reward": 0.9419295381271846, "rewards/mask_iou_reward": 0.4709647690635923, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.9558261930942535, "rewards/thk_ans_format_reward": 1.0, "step": 985, "think_completion_length": 50.0625 }, { "clip_ratio": 0.0, "completion_length": 127.015625, "epoch": 1.6644182124789206, "grad_norm": 5.822222361871982, "kl": 0.509765625, "learning_rate": 6.674536256323777e-07, "loss": 0.0005, "reward": 3.062288284301758, "reward_std": 0.13281331211328506, "rewards/final_reward": 1.1950692639241447, "rewards/mask_iou_reward": 0.5975346319620723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0622882843017578, "rewards/thk_ans_format_reward": 1.0, "step": 986, "think_completion_length": 56.1875 }, { "clip_ratio": 0.0, "completion_length": 142.109375, "epoch": 1.6661045531197303, "grad_norm": 6.756491763935447, "kl": 0.474609375, "learning_rate": 6.671163575042158e-07, "loss": 0.0005, "reward": 2.8552377223968506, "reward_std": 0.24670583754777908, "rewards/final_reward": 0.6170252546302251, "rewards/mask_iou_reward": 0.30851262731511253, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8708627223968506, "rewards/thk_ans_format_reward": 1.0, "step": 987, "think_completion_length": 52.90625 }, { "clip_ratio": 0.0, "completion_length": 124.578125, "epoch": 1.6677908937605397, "grad_norm": 6.603201839448816, "kl": 0.443359375, "learning_rate": 6.66779089376054e-07, "loss": 0.0004, "reward": 3.013177275657654, "reward_std": 0.22320347279310226, "rewards/final_reward": 0.7690452230863198, "rewards/mask_iou_reward": 0.3845226115431599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0131773054599762, "rewards/thk_ans_format_reward": 1.0, "step": 988, "think_completion_length": 50.84375 }, { "clip_ratio": 0.0, "completion_length": 150.125, "epoch": 1.669477234401349, "grad_norm": 6.814146979881974, "kl": 0.52734375, "learning_rate": 6.664418212478921e-07, "loss": 0.0005, "reward": 3.1289366483688354, "reward_std": 0.22629151493310928, "rewards/final_reward": 1.236931896683771, "rewards/mask_iou_reward": 0.6184659483418855, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1445615887641907, "rewards/thk_ans_format_reward": 1.0, "step": 989, "think_completion_length": 46.96875 }, { "clip_ratio": 0.0, "completion_length": 138.390625, "epoch": 1.6711635750421585, "grad_norm": 7.369379403584672, "kl": 0.4375, "learning_rate": 6.661045531197301e-07, "loss": 0.0004, "reward": 3.5929524898529053, "reward_std": 0.19491755589842796, "rewards/final_reward": 1.80680461215357, "rewards/mask_iou_reward": 0.903402306076785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5929523706436157, "rewards/thk_ans_format_reward": 1.0, "step": 990, "think_completion_length": 55.53125 }, { "clip_ratio": 0.0, "completion_length": 123.140625, "epoch": 1.672849915682968, "grad_norm": 8.968397003633898, "kl": 0.458984375, "learning_rate": 6.657672849915683e-07, "loss": 0.0005, "reward": 3.1989855766296387, "reward_std": 0.4630395621061325, "rewards/final_reward": 0.8420022603292938, "rewards/mask_iou_reward": 0.4210011301646469, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2146106958389282, "rewards/thk_ans_format_reward": 1.0, "step": 991, "think_completion_length": 56.6875 }, { "clip_ratio": 0.0, "completion_length": 141.234375, "epoch": 1.6745362563237776, "grad_norm": 4.908390198867559, "kl": 0.462890625, "learning_rate": 6.654300168634064e-07, "loss": 0.0005, "reward": 3.2098724842071533, "reward_std": 0.11346443742513657, "rewards/final_reward": 1.7597471616814055, "rewards/mask_iou_reward": 0.8798735808407028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2098724842071533, "rewards/thk_ans_format_reward": 1.0, "step": 992, "think_completion_length": 44.3125 }, { "clip_ratio": 0.0, "completion_length": 124.15625, "epoch": 1.6762225969645868, "grad_norm": 18.70764305875646, "kl": 0.52734375, "learning_rate": 6.650927487352444e-07, "loss": 0.0005, "reward": 2.8987958431243896, "reward_std": 0.09079464711248875, "rewards/final_reward": 0.8476097891744324, "rewards/mask_iou_reward": 0.4238048945872162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8987958431243896, "rewards/thk_ans_format_reward": 1.0, "step": 993, "think_completion_length": 54.25 }, { "clip_ratio": 0.0, "completion_length": 116.78125, "epoch": 1.6779089376053964, "grad_norm": 18.73896888137206, "kl": 0.513671875, "learning_rate": 6.647554806070826e-07, "loss": 0.0005, "reward": 3.5080639123916626, "reward_std": 0.06798750162124634, "rewards/final_reward": 1.4192495858629686, "rewards/mask_iou_reward": 0.7096247929314843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5080639123916626, "rewards/thk_ans_format_reward": 1.0, "step": 994, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 152.46875, "epoch": 1.6795952782462056, "grad_norm": 12.262342821384445, "kl": 0.44140625, "learning_rate": 6.644182124789207e-07, "loss": 0.0004, "reward": 3.409273624420166, "reward_std": 0.3849910721182823, "rewards/final_reward": 1.5313665169886965, "rewards/mask_iou_reward": 0.7656832584943483, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.4717735648155212, "rewards/thk_ans_format_reward": 0.96875, "step": 995, "think_completion_length": 47.25 }, { "clip_ratio": 0.0, "completion_length": 117.359375, "epoch": 1.6812816188870152, "grad_norm": 8.38298900781314, "kl": 0.4580078125, "learning_rate": 6.640809443507588e-07, "loss": 0.0005, "reward": 3.8332111835479736, "reward_std": 0.04581199027597904, "rewards/final_reward": 1.7347238305560204, "rewards/mask_iou_reward": 0.8673619152780102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8332111835479736, "rewards/thk_ans_format_reward": 1.0, "step": 996, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 133.890625, "epoch": 1.6829679595278246, "grad_norm": 46.42672517479715, "kl": 1.2060546875, "learning_rate": 6.63743676222597e-07, "loss": 0.0012, "reward": 2.7516547441482544, "reward_std": 0.07285407930612564, "rewards/final_reward": 0.6759741659116778, "rewards/mask_iou_reward": 0.3379870829558389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7516548335552216, "rewards/thk_ans_format_reward": 1.0, "step": 997, "think_completion_length": 50.34375 }, { "clip_ratio": 0.0, "completion_length": 126.4375, "epoch": 1.684654300168634, "grad_norm": 4.497527571240939, "kl": 0.4501953125, "learning_rate": 6.63406408094435e-07, "loss": 0.0005, "reward": 3.022408366203308, "reward_std": 0.2865653783082962, "rewards/final_reward": 1.0426820398855086, "rewards/mask_iou_reward": 0.5213410199427543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0224083065986633, "rewards/thk_ans_format_reward": 1.0, "step": 998, "think_completion_length": 57.1875 }, { "clip_ratio": 0.0, "completion_length": 116.421875, "epoch": 1.6863406408094435, "grad_norm": 15.870394792828607, "kl": 0.5126953125, "learning_rate": 6.630691399662731e-07, "loss": 0.0005, "reward": 3.2935569286346436, "reward_std": 0.2325892373919487, "rewards/final_reward": 1.3165577341999402, "rewards/mask_iou_reward": 0.6582788670999701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2935569882392883, "rewards/thk_ans_format_reward": 1.0, "step": 999, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 120.90625, "epoch": 1.6880269814502529, "grad_norm": 7.690590543519975, "kl": 0.5068359375, "learning_rate": 6.627318718381113e-07, "loss": 0.0005, "reward": 3.773517370223999, "reward_std": 0.15830123564228415, "rewards/final_reward": 1.6504267851557586, "rewards/mask_iou_reward": 0.8252133925778793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.773517370223999, "rewards/thk_ans_format_reward": 1.0, "step": 1000, "think_completion_length": 52.34375 }, { "clip_ratio": 0.0, "completion_length": 115.90625, "epoch": 1.6897133220910625, "grad_norm": 21.874063040209965, "kl": 0.603515625, "learning_rate": 6.623946037099494e-07, "loss": 0.0006, "reward": 3.3424084186553955, "reward_std": 0.10551745275733992, "rewards/final_reward": 1.1636657158449948, "rewards/mask_iou_reward": 0.5818328579224974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3424084186553955, "rewards/thk_ans_format_reward": 1.0, "step": 1001, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 123.484375, "epoch": 1.6913996627318717, "grad_norm": 9.269280540478269, "kl": 0.4208984375, "learning_rate": 6.620573355817874e-07, "loss": 0.0004, "reward": 3.7046056985855103, "reward_std": 0.07972065731883049, "rewards/final_reward": 1.7708265223477202, "rewards/mask_iou_reward": 0.8854132611738601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7046056985855103, "rewards/thk_ans_format_reward": 1.0, "step": 1002, "think_completion_length": 59.875 }, { "clip_ratio": 0.0, "completion_length": 120.765625, "epoch": 1.6930860033726813, "grad_norm": 9.109123179163847, "kl": 0.494140625, "learning_rate": 6.617200674536256e-07, "loss": 0.0005, "reward": 2.643565535545349, "reward_std": 0.16619166731834412, "rewards/final_reward": 0.28977825331355156, "rewards/mask_iou_reward": 0.14488912665677578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6435655355453491, "rewards/thk_ans_format_reward": 1.0, "step": 1003, "think_completion_length": 51.25 }, { "clip_ratio": 0.0, "completion_length": 122.0, "epoch": 1.6947723440134908, "grad_norm": 15.214547323566004, "kl": 0.498046875, "learning_rate": 6.613827993254637e-07, "loss": 0.0005, "reward": 3.6040256023406982, "reward_std": 0.1329221185296774, "rewards/final_reward": 1.6264060886231384, "rewards/mask_iou_reward": 0.8132030443115692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6040256023406982, "rewards/thk_ans_format_reward": 1.0, "step": 1004, "think_completion_length": 51.34375 }, { "clip_ratio": 0.0, "completion_length": 120.796875, "epoch": 1.6964586846543002, "grad_norm": 5.992950266206152, "kl": 0.455078125, "learning_rate": 6.610455311973018e-07, "loss": 0.0005, "reward": 3.3491801023483276, "reward_std": 0.16978841368108988, "rewards/final_reward": 1.5574006048073583, "rewards/mask_iou_reward": 0.7787003024036792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3491801619529724, "rewards/thk_ans_format_reward": 1.0, "step": 1005, "think_completion_length": 53.5625 }, { "clip_ratio": 0.0, "completion_length": 119.71875, "epoch": 1.6981450252951096, "grad_norm": 12.419592591697839, "kl": 0.4609375, "learning_rate": 6.6070826306914e-07, "loss": 0.0005, "reward": 3.398915648460388, "reward_std": 0.23545659333467484, "rewards/final_reward": 1.6019027950726148, "rewards/mask_iou_reward": 0.8009513975363074, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3989156484603882, "rewards/thk_ans_format_reward": 1.0, "step": 1006, "think_completion_length": 59.78125 }, { "clip_ratio": 0.0, "completion_length": 172.109375, "epoch": 1.699831365935919, "grad_norm": 3.8120154688685104, "kl": 0.41796875, "learning_rate": 6.60370994940978e-07, "loss": 0.0004, "reward": 2.801236391067505, "reward_std": 0.23255718499422073, "rewards/final_reward": 0.7371322477079272, "rewards/mask_iou_reward": 0.3685661238539636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8012365102767944, "rewards/thk_ans_format_reward": 1.0, "step": 1007, "think_completion_length": 48.875 }, { "clip_ratio": 0.0, "completion_length": 120.078125, "epoch": 1.7015177065767286, "grad_norm": 7.748580989294214, "kl": 0.470703125, "learning_rate": 6.600337268128161e-07, "loss": 0.0005, "reward": 3.6431901454925537, "reward_std": 0.045935716829262674, "rewards/final_reward": 1.5224881620611639, "rewards/mask_iou_reward": 0.7612440810305819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6431901454925537, "rewards/thk_ans_format_reward": 1.0, "step": 1008, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 121.171875, "epoch": 1.7032040472175378, "grad_norm": 5.737195692191415, "kl": 0.494140625, "learning_rate": 6.596964586846543e-07, "loss": 0.0005, "reward": 3.3539552688598633, "reward_std": 0.16605842299759388, "rewards/final_reward": 1.6803083781412165, "rewards/mask_iou_reward": 0.8401541890706082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.353955328464508, "rewards/thk_ans_format_reward": 1.0, "step": 1009, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 125.171875, "epoch": 1.7048903878583475, "grad_norm": 7.962368314621498, "kl": 0.548828125, "learning_rate": 6.593591905564924e-07, "loss": 0.0005, "reward": 2.8921409845352173, "reward_std": 0.24707718193531036, "rewards/final_reward": 0.5459787002187156, "rewards/mask_iou_reward": 0.2729893501093578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8921409249305725, "rewards/thk_ans_format_reward": 1.0, "step": 1010, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 145.484375, "epoch": 1.7065767284991569, "grad_norm": 9.146835254517585, "kl": 0.486328125, "learning_rate": 6.590219224283306e-07, "loss": 0.0005, "reward": 3.0934470891952515, "reward_std": 0.16183524578809738, "rewards/final_reward": 0.7069903265430519, "rewards/mask_iou_reward": 0.35349516327152597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0934468805789948, "rewards/thk_ans_format_reward": 1.0, "step": 1011, "think_completion_length": 53.8125 }, { "clip_ratio": 0.0, "completion_length": 134.203125, "epoch": 1.7082630691399663, "grad_norm": 13.175316522972508, "kl": 0.4267578125, "learning_rate": 6.586846543001686e-07, "loss": 0.0005, "reward": 3.282162666320801, "reward_std": 0.18916182965040207, "rewards/final_reward": 1.0668367459031367, "rewards/mask_iou_reward": 0.5334183729515684, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2821626663208008, "rewards/thk_ans_format_reward": 1.0, "step": 1012, "think_completion_length": 53.5625 }, { "clip_ratio": 0.0, "completion_length": 121.328125, "epoch": 1.7099494097807757, "grad_norm": 8.659680807686604, "kl": 0.45703125, "learning_rate": 6.583473861720067e-07, "loss": 0.0005, "reward": 3.1156177520751953, "reward_std": 0.08516193181276321, "rewards/final_reward": 0.5883904742783279, "rewards/mask_iou_reward": 0.29419523713916396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.11561781167984, "rewards/thk_ans_format_reward": 1.0, "step": 1013, "think_completion_length": 52.09375 }, { "clip_ratio": 0.0, "completion_length": 116.734375, "epoch": 1.7116357504215851, "grad_norm": 18.041790037299258, "kl": 0.556640625, "learning_rate": 6.580101180438449e-07, "loss": 0.0006, "reward": 3.402442216873169, "reward_std": 0.08341848477721214, "rewards/final_reward": 1.1143318288277488, "rewards/mask_iou_reward": 0.5571659144138744, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.402442216873169, "rewards/thk_ans_format_reward": 1.0, "step": 1014, "think_completion_length": 51.84375 }, { "clip_ratio": 0.0, "completion_length": 119.46875, "epoch": 1.7133220910623947, "grad_norm": 11.7012394200138, "kl": 0.49609375, "learning_rate": 6.576728499156829e-07, "loss": 0.0005, "reward": 2.6576205492019653, "reward_std": 0.22013526409864426, "rewards/final_reward": 0.7089271269158894, "rewards/mask_iou_reward": 0.3544635634579447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6576206088066101, "rewards/thk_ans_format_reward": 1.0, "step": 1015, "think_completion_length": 53.125 }, { "clip_ratio": 0.0, "completion_length": 126.46875, "epoch": 1.715008431703204, "grad_norm": 4.359995063594274, "kl": 0.4912109375, "learning_rate": 6.57335581787521e-07, "loss": 0.0005, "reward": 3.22420072555542, "reward_std": 0.28856465220451355, "rewards/final_reward": 1.3810333083133202, "rewards/mask_iou_reward": 0.6905166541566601, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.239825963973999, "rewards/thk_ans_format_reward": 1.0, "step": 1016, "think_completion_length": 59.6875 }, { "clip_ratio": 0.0, "completion_length": 150.515625, "epoch": 1.7166947723440136, "grad_norm": 4.3461415710926, "kl": 0.43359375, "learning_rate": 6.569983136593592e-07, "loss": 0.0004, "reward": 3.1074719429016113, "reward_std": 0.32912111282348633, "rewards/final_reward": 1.0721299991151474, "rewards/mask_iou_reward": 0.5360649995575737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1074718832969666, "rewards/thk_ans_format_reward": 1.0, "step": 1017, "think_completion_length": 51.9375 }, { "clip_ratio": 0.0, "completion_length": 213.421875, "epoch": 1.718381112984823, "grad_norm": 21.260604756308766, "kl": 0.357421875, "learning_rate": 6.566610455311973e-07, "loss": 0.0004, "reward": 2.964912176132202, "reward_std": 0.21077851206064224, "rewards/final_reward": 1.1033498150252896, "rewards/mask_iou_reward": 0.5516749075126448, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9649122655391693, "rewards/thk_ans_format_reward": 1.0, "step": 1018, "think_completion_length": 59.9375 }, { "clip_ratio": 0.0, "completion_length": 120.8125, "epoch": 1.7200674536256324, "grad_norm": 7.706174182028309, "kl": 0.45703125, "learning_rate": 6.563237774030354e-07, "loss": 0.0005, "reward": 2.96855366230011, "reward_std": 0.18945128098130226, "rewards/final_reward": 0.8479449831806979, "rewards/mask_iou_reward": 0.42397249159034894, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9685536324977875, "rewards/thk_ans_format_reward": 1.0, "step": 1019, "think_completion_length": 47.03125 }, { "clip_ratio": 0.0, "completion_length": 128.46875, "epoch": 1.7217537942664418, "grad_norm": 9.21032483206, "kl": 0.470703125, "learning_rate": 6.559865092748735e-07, "loss": 0.0005, "reward": 3.0714797973632812, "reward_std": 0.18478820845484734, "rewards/final_reward": 0.9969208253996952, "rewards/mask_iou_reward": 0.4984604126998476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.071479707956314, "rewards/thk_ans_format_reward": 1.0, "step": 1020, "think_completion_length": 55.40625 }, { "clip_ratio": 0.0, "completion_length": 119.796875, "epoch": 1.7234401349072512, "grad_norm": 11.380041316059248, "kl": 0.4501953125, "learning_rate": 6.556492411467116e-07, "loss": 0.0004, "reward": 3.2528512477874756, "reward_std": 0.24282580788712949, "rewards/final_reward": 0.6978428253976936, "rewards/mask_iou_reward": 0.3489214126988468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2528512477874756, "rewards/thk_ans_format_reward": 1.0, "step": 1021, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 121.578125, "epoch": 1.7251264755480609, "grad_norm": 4.529523135510817, "kl": 0.462890625, "learning_rate": 6.553119730185497e-07, "loss": 0.0005, "reward": 3.537788510322571, "reward_std": 0.17213429510593414, "rewards/final_reward": 1.4058552515127203, "rewards/mask_iou_reward": 0.7029276257563601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.537788450717926, "rewards/thk_ans_format_reward": 1.0, "step": 1022, "think_completion_length": 48.78125 }, { "clip_ratio": 0.0, "completion_length": 121.953125, "epoch": 1.72681281618887, "grad_norm": 7.594129251397134, "kl": 0.552734375, "learning_rate": 6.549747048903878e-07, "loss": 0.0006, "reward": 3.0862042903900146, "reward_std": 0.09378309547901154, "rewards/final_reward": 0.9123979252746075, "rewards/mask_iou_reward": 0.45619896263730375, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0862043499946594, "rewards/thk_ans_format_reward": 1.0, "step": 1023, "think_completion_length": 51.875 }, { "clip_ratio": 0.0, "completion_length": 120.890625, "epoch": 1.7284991568296797, "grad_norm": 15.814763062458056, "kl": 0.4609375, "learning_rate": 6.546374367622259e-07, "loss": 0.0005, "reward": 3.47658109664917, "reward_std": 0.09444395080208778, "rewards/final_reward": 1.546256486336178, "rewards/mask_iou_reward": 0.773128243168089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4765812158584595, "rewards/thk_ans_format_reward": 1.0, "step": 1024, "think_completion_length": 55.90625 }, { "clip_ratio": 0.0, "completion_length": 126.0625, "epoch": 1.7301854974704889, "grad_norm": 23.226417626108393, "kl": 0.4951171875, "learning_rate": 6.54300168634064e-07, "loss": 0.0005, "reward": 3.4563424587249756, "reward_std": 0.15790753066539764, "rewards/final_reward": 1.3368036721575525, "rewards/mask_iou_reward": 0.6684018360787762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4563424587249756, "rewards/thk_ans_format_reward": 1.0, "step": 1025, "think_completion_length": 53.1875 }, { "clip_ratio": 0.0, "completion_length": 119.140625, "epoch": 1.7318718381112985, "grad_norm": 4.910500372680354, "kl": 0.478515625, "learning_rate": 6.539629005059022e-07, "loss": 0.0005, "reward": 3.1884742975234985, "reward_std": 0.25331611186265945, "rewards/final_reward": 1.78840388322968, "rewards/mask_iou_reward": 0.89420194161484, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.204099178314209, "rewards/thk_ans_format_reward": 1.0, "step": 1026, "think_completion_length": 51.53125 }, { "clip_ratio": 0.0, "completion_length": 126.390625, "epoch": 1.733558178752108, "grad_norm": 8.4172557883391, "kl": 0.4970703125, "learning_rate": 6.536256323777403e-07, "loss": 0.0005, "reward": 3.1042816638946533, "reward_std": 0.08971688710153103, "rewards/final_reward": 1.287853213224611, "rewards/mask_iou_reward": 0.6439266066123055, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.104281485080719, "rewards/thk_ans_format_reward": 1.0, "step": 1027, "think_completion_length": 51.75 }, { "clip_ratio": 0.0, "completion_length": 119.875, "epoch": 1.7352445193929174, "grad_norm": 12.583209258616298, "kl": 0.4970703125, "learning_rate": 6.532883642495784e-07, "loss": 0.0005, "reward": 3.4771846532821655, "reward_std": 0.13392280414700508, "rewards/final_reward": 1.3426299767605006, "rewards/mask_iou_reward": 0.6713149883802503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4771845936775208, "rewards/thk_ans_format_reward": 1.0, "step": 1028, "think_completion_length": 49.25 }, { "clip_ratio": 0.0, "completion_length": 167.8125, "epoch": 1.7369308600337268, "grad_norm": 5.460307618750845, "kl": 0.4345703125, "learning_rate": 6.529510961214165e-07, "loss": 0.0004, "reward": 2.8394622802734375, "reward_std": 0.2670469731092453, "rewards/final_reward": 0.8895338814851627, "rewards/mask_iou_reward": 0.44476694074258133, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8550873398780823, "rewards/thk_ans_format_reward": 1.0, "step": 1029, "think_completion_length": 48.0625 }, { "clip_ratio": 0.0, "completion_length": 120.109375, "epoch": 1.7386172006745362, "grad_norm": 4.352846224632927, "kl": 0.4345703125, "learning_rate": 6.526138279932546e-07, "loss": 0.0004, "reward": 3.856082320213318, "reward_std": 0.011286142049357295, "rewards/final_reward": 1.8986783874918174, "rewards/mask_iou_reward": 0.9493391937459087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8560824394226074, "rewards/thk_ans_format_reward": 1.0, "step": 1030, "think_completion_length": 50.21875 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 1.7403035413153458, "grad_norm": 9.118879145324778, "kl": 0.462890625, "learning_rate": 6.522765598650926e-07, "loss": 0.0005, "reward": 3.309417724609375, "reward_std": 0.452437125146389, "rewards/final_reward": 1.1533700789648118, "rewards/mask_iou_reward": 0.5766850394824059, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3094177842140198, "rewards/thk_ans_format_reward": 1.0, "step": 1031, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 122.671875, "epoch": 1.741989881956155, "grad_norm": 9.320656417334392, "kl": 0.4638671875, "learning_rate": 6.519392917369308e-07, "loss": 0.0005, "reward": 3.5183212757110596, "reward_std": 0.22874368727207184, "rewards/final_reward": 1.4453660173187608, "rewards/mask_iou_reward": 0.7226830086593804, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5183210968971252, "rewards/thk_ans_format_reward": 1.0, "step": 1032, "think_completion_length": 57.3125 }, { "clip_ratio": 0.0, "completion_length": 167.234375, "epoch": 1.7436762225969646, "grad_norm": 6.520407909354839, "kl": 0.404296875, "learning_rate": 6.516020236087689e-07, "loss": 0.0004, "reward": 2.8486337661743164, "reward_std": 0.2386086881160736, "rewards/final_reward": 0.8908258070687297, "rewards/mask_iou_reward": 0.44541290353436486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.848633736371994, "rewards/thk_ans_format_reward": 1.0, "step": 1033, "think_completion_length": 47.15625 }, { "clip_ratio": 0.0, "completion_length": 121.703125, "epoch": 1.745362563237774, "grad_norm": 41.13201708403674, "kl": 0.515625, "learning_rate": 6.51264755480607e-07, "loss": 0.0005, "reward": 3.380680561065674, "reward_std": 0.17691625840961933, "rewards/final_reward": 1.2292933076821977, "rewards/mask_iou_reward": 0.6146466538410988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3806805610656738, "rewards/thk_ans_format_reward": 1.0, "step": 1034, "think_completion_length": 48.9375 }, { "clip_ratio": 0.0, "completion_length": 121.953125, "epoch": 1.7470489038785835, "grad_norm": 6.130999116033341, "kl": 0.4736328125, "learning_rate": 6.509274873524452e-07, "loss": 0.0004, "reward": 3.4069515466690063, "reward_std": 0.24042115407064557, "rewards/final_reward": 1.7034202245018428, "rewards/mask_iou_reward": 0.8517101122509214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4069515466690063, "rewards/thk_ans_format_reward": 1.0, "step": 1035, "think_completion_length": 52.65625 }, { "clip_ratio": 0.0, "completion_length": 128.015625, "epoch": 1.7487352445193929, "grad_norm": 5.4172837704426575, "kl": 0.4267578125, "learning_rate": 6.505902192242833e-07, "loss": 0.0004, "reward": 2.547904133796692, "reward_std": 0.11162854917347431, "rewards/final_reward": 0.0386555181927269, "rewards/mask_iou_reward": 0.01932775909636345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5479041039943695, "rewards/thk_ans_format_reward": 1.0, "step": 1036, "think_completion_length": 56.25 }, { "clip_ratio": 0.0, "completion_length": 121.5, "epoch": 1.7504215851602023, "grad_norm": 33.880899390677925, "kl": 0.46484375, "learning_rate": 6.502529510961215e-07, "loss": 0.0005, "reward": 3.2491201162338257, "reward_std": 0.16262406716123223, "rewards/final_reward": 1.4198772441300478, "rewards/mask_iou_reward": 0.7099386220650239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2647451758384705, "rewards/thk_ans_format_reward": 0.984375, "step": 1037, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 123.59375, "epoch": 1.752107925801012, "grad_norm": 31.689644962105042, "kl": 0.513671875, "learning_rate": 6.499156829679595e-07, "loss": 0.0005, "reward": 3.308144211769104, "reward_std": 0.06957734003663063, "rewards/final_reward": 1.5038740533194743, "rewards/mask_iou_reward": 0.7519370266597372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3081441521644592, "rewards/thk_ans_format_reward": 1.0, "step": 1038, "think_completion_length": 50.09375 }, { "clip_ratio": 0.0, "completion_length": 147.359375, "epoch": 1.7537942664418211, "grad_norm": 16.50416060782597, "kl": 0.8203125, "learning_rate": 6.495784148397976e-07, "loss": 0.0008, "reward": 2.7542325258255005, "reward_std": 0.45590740442276, "rewards/final_reward": 1.0102187496451316, "rewards/mask_iou_reward": 0.5051093748225658, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7542325109243393, "rewards/thk_ans_format_reward": 1.0, "step": 1039, "think_completion_length": 52.125 }, { "clip_ratio": 0.0, "completion_length": 146.640625, "epoch": 1.7554806070826308, "grad_norm": 6.722045575335457, "kl": 0.4248046875, "learning_rate": 6.492411467116357e-07, "loss": 0.0004, "reward": 3.158905863761902, "reward_std": 0.24892936274409294, "rewards/final_reward": 1.0586207550274105, "rewards/mask_iou_reward": 0.5293103775137052, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1589058637619019, "rewards/thk_ans_format_reward": 1.0, "step": 1040, "think_completion_length": 51.0625 }, { "clip_ratio": 0.0, "completion_length": 133.921875, "epoch": 1.7571669477234402, "grad_norm": 9.194304976346283, "kl": 0.4462890625, "learning_rate": 6.489038785834738e-07, "loss": 0.0004, "reward": 3.509106397628784, "reward_std": 0.1190731879323721, "rewards/final_reward": 1.588458335658146, "rewards/mask_iou_reward": 0.794229167829073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5091063976287842, "rewards/thk_ans_format_reward": 1.0, "step": 1041, "think_completion_length": 65.8125 }, { "clip_ratio": 0.0, "completion_length": 124.640625, "epoch": 1.7588532883642496, "grad_norm": 25.54207241093614, "kl": 0.46484375, "learning_rate": 6.485666104553119e-07, "loss": 0.0005, "reward": 3.1248477697372437, "reward_std": 0.30804644525051117, "rewards/final_reward": 1.4215654945130256, "rewards/mask_iou_reward": 0.7107827472565128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1248477101325989, "rewards/thk_ans_format_reward": 1.0, "step": 1042, "think_completion_length": 55.3125 }, { "clip_ratio": 0.0, "completion_length": 134.40625, "epoch": 1.760539629005059, "grad_norm": 6.3995610391893, "kl": 0.431640625, "learning_rate": 6.482293423271501e-07, "loss": 0.0004, "reward": 3.5307756662368774, "reward_std": 0.27365532889962196, "rewards/final_reward": 1.4586337692907971, "rewards/mask_iou_reward": 0.7293168846453986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.530775785446167, "rewards/thk_ans_format_reward": 1.0, "step": 1043, "think_completion_length": 51.40625 }, { "clip_ratio": 0.0, "completion_length": 120.765625, "epoch": 1.7622259696458684, "grad_norm": 13.867074363938958, "kl": 0.5029296875, "learning_rate": 6.478920741989882e-07, "loss": 0.0005, "reward": 3.391425848007202, "reward_std": 0.10993809998035431, "rewards/final_reward": 1.2119668066644858, "rewards/mask_iou_reward": 0.6059834033322429, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3914258480072021, "rewards/thk_ans_format_reward": 1.0, "step": 1044, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 132.625, "epoch": 1.763912310286678, "grad_norm": 3.4706313534724047, "kl": 0.447265625, "learning_rate": 6.475548060708263e-07, "loss": 0.0004, "reward": 2.8153789043426514, "reward_std": 0.04241855535656214, "rewards/final_reward": 0.18875813571196698, "rewards/mask_iou_reward": 0.09437906785598349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8153788447380066, "rewards/thk_ans_format_reward": 1.0, "step": 1045, "think_completion_length": 55.1875 }, { "clip_ratio": 0.0, "completion_length": 136.03125, "epoch": 1.7655986509274872, "grad_norm": 4.695301218534343, "kl": 0.5595703125, "learning_rate": 6.472175379426645e-07, "loss": 0.0005, "reward": 3.5663245916366577, "reward_std": 0.0903189332166221, "rewards/final_reward": 1.5521931867676244, "rewards/mask_iou_reward": 0.7760965933838122, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5663246512413025, "rewards/thk_ans_format_reward": 1.0, "step": 1046, "think_completion_length": 50.0625 }, { "clip_ratio": 0.0, "completion_length": 121.21875, "epoch": 1.7672849915682969, "grad_norm": 18.269976905096588, "kl": 0.4765625, "learning_rate": 6.468802698145025e-07, "loss": 0.0005, "reward": 3.310370087623596, "reward_std": 0.0983478156849742, "rewards/final_reward": 1.2876556004640443, "rewards/mask_iou_reward": 0.6438278002320221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3103700280189514, "rewards/thk_ans_format_reward": 1.0, "step": 1047, "think_completion_length": 48.78125 }, { "clip_ratio": 0.0, "completion_length": 123.140625, "epoch": 1.768971332209106, "grad_norm": 20.295614577115845, "kl": 0.427734375, "learning_rate": 6.465430016863405e-07, "loss": 0.0004, "reward": 2.9448623657226562, "reward_std": 0.22916459874249995, "rewards/final_reward": 0.40654357120444046, "rewards/mask_iou_reward": 0.20327178560222023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9448623657226562, "rewards/thk_ans_format_reward": 1.0, "step": 1048, "think_completion_length": 53.0 }, { "clip_ratio": 0.0, "completion_length": 132.78125, "epoch": 1.7706576728499157, "grad_norm": 9.65434244461805, "kl": 0.4228515625, "learning_rate": 6.462057335581787e-07, "loss": 0.0004, "reward": 3.411288022994995, "reward_std": 0.21771667152643204, "rewards/final_reward": 1.098073992844263, "rewards/mask_iou_reward": 0.5490369964221316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4112881422042847, "rewards/thk_ans_format_reward": 1.0, "step": 1049, "think_completion_length": 71.125 }, { "clip_ratio": 0.0, "completion_length": 119.90625, "epoch": 1.7723440134907251, "grad_norm": 12.128173202319173, "kl": 0.4677734375, "learning_rate": 6.458684654300168e-07, "loss": 0.0005, "reward": 3.588056802749634, "reward_std": 0.1198611631989479, "rewards/final_reward": 1.7642558749700097, "rewards/mask_iou_reward": 0.8821279374850048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5880568027496338, "rewards/thk_ans_format_reward": 1.0, "step": 1050, "think_completion_length": 50.53125 }, { "clip_ratio": 0.0, "completion_length": 140.59375, "epoch": 1.7740303541315345, "grad_norm": 9.552772331867274, "kl": 0.4638671875, "learning_rate": 6.455311973018549e-07, "loss": 0.0005, "reward": 2.649136185646057, "reward_std": 0.09959585964679718, "rewards/final_reward": 0.70501502889627, "rewards/mask_iou_reward": 0.352507514448135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6491361558437347, "rewards/thk_ans_format_reward": 1.0, "step": 1051, "think_completion_length": 54.375 }, { "clip_ratio": 0.0, "completion_length": 135.5625, "epoch": 1.7757166947723442, "grad_norm": 7.4506685071560455, "kl": 0.51953125, "learning_rate": 6.451939291736931e-07, "loss": 0.0005, "reward": 3.7473593950271606, "reward_std": 0.021921713836491108, "rewards/final_reward": 1.6334566791793033, "rewards/mask_iou_reward": 0.8167283395896516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7473594546318054, "rewards/thk_ans_format_reward": 1.0, "step": 1052, "think_completion_length": 68.4375 }, { "clip_ratio": 0.0, "completion_length": 135.25, "epoch": 1.7774030354131534, "grad_norm": 11.452336974810411, "kl": 0.431640625, "learning_rate": 6.448566610455312e-07, "loss": 0.0004, "reward": 3.4113532304763794, "reward_std": 0.15724964579567313, "rewards/final_reward": 1.3157614341154118, "rewards/mask_iou_reward": 0.6578807170577059, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4113531708717346, "rewards/thk_ans_format_reward": 1.0, "step": 1053, "think_completion_length": 60.625 }, { "clip_ratio": 0.0, "completion_length": 145.375, "epoch": 1.779089376053963, "grad_norm": 5.636753144188764, "kl": 0.4638671875, "learning_rate": 6.445193929173693e-07, "loss": 0.0005, "reward": 2.8758299350738525, "reward_std": 0.24051348865032196, "rewards/final_reward": 0.7168088626733424, "rewards/mask_iou_reward": 0.3584044313366712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8758300244808197, "rewards/thk_ans_format_reward": 1.0, "step": 1054, "think_completion_length": 51.03125 }, { "clip_ratio": 0.0, "completion_length": 120.921875, "epoch": 1.7807757166947722, "grad_norm": 9.697318267357094, "kl": 0.5068359375, "learning_rate": 6.441821247892075e-07, "loss": 0.0005, "reward": 3.427851915359497, "reward_std": 0.16114804474636912, "rewards/final_reward": 1.7093380725822347, "rewards/mask_iou_reward": 0.8546690362911173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4278518557548523, "rewards/thk_ans_format_reward": 1.0, "step": 1055, "think_completion_length": 49.09375 }, { "clip_ratio": 0.0, "completion_length": 126.75, "epoch": 1.7824620573355818, "grad_norm": 5.99894941422999, "kl": 0.4765625, "learning_rate": 6.438448566610454e-07, "loss": 0.0005, "reward": 3.261791706085205, "reward_std": 0.26013752818107605, "rewards/final_reward": 1.0026498346326282, "rewards/mask_iou_reward": 0.5013249173163141, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2617915272712708, "rewards/thk_ans_format_reward": 1.0, "step": 1056, "think_completion_length": 55.25 }, { "clip_ratio": 0.0, "completion_length": 125.890625, "epoch": 1.7841483979763912, "grad_norm": 18.634833449472875, "kl": 0.5947265625, "learning_rate": 6.435075885328835e-07, "loss": 0.0006, "reward": 3.4682400226593018, "reward_std": 0.09191236272454262, "rewards/final_reward": 1.3746822163096801, "rewards/mask_iou_reward": 0.6873411081548401, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4682400822639465, "rewards/thk_ans_format_reward": 1.0, "step": 1057, "think_completion_length": 52.28125 }, { "clip_ratio": 0.0, "completion_length": 153.140625, "epoch": 1.7858347386172007, "grad_norm": 13.903214719949169, "kl": 0.3984375, "learning_rate": 6.431703204047217e-07, "loss": 0.0004, "reward": 3.124882459640503, "reward_std": 0.22556371614336967, "rewards/final_reward": 1.3507318497634557, "rewards/mask_iou_reward": 0.6753659248817279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.124882459640503, "rewards/thk_ans_format_reward": 1.0, "step": 1058, "think_completion_length": 62.03125 }, { "clip_ratio": 0.0, "completion_length": 121.609375, "epoch": 1.78752107925801, "grad_norm": 6.8004779635053465, "kl": 0.4423828125, "learning_rate": 6.428330522765598e-07, "loss": 0.0004, "reward": 3.142040967941284, "reward_std": 0.05950396414846182, "rewards/final_reward": 1.3599471646303916, "rewards/mask_iou_reward": 0.6799735823151958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1420409381389618, "rewards/thk_ans_format_reward": 1.0, "step": 1059, "think_completion_length": 52.28125 }, { "clip_ratio": 0.0, "completion_length": 131.25, "epoch": 1.7892074198988195, "grad_norm": 11.336990333878964, "kl": 0.43359375, "learning_rate": 6.42495784148398e-07, "loss": 0.0004, "reward": 3.2227389812469482, "reward_std": 0.18827488273382187, "rewards/final_reward": 1.7645477845927253, "rewards/mask_iou_reward": 0.8822738922963627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2227389216423035, "rewards/thk_ans_format_reward": 1.0, "step": 1060, "think_completion_length": 47.375 }, { "clip_ratio": 0.0, "completion_length": 124.25, "epoch": 1.7908937605396291, "grad_norm": 12.35064431303953, "kl": 0.4462890625, "learning_rate": 6.421585160202361e-07, "loss": 0.0004, "reward": 3.690014123916626, "reward_std": 0.12409292161464691, "rewards/final_reward": 1.561800469907261, "rewards/mask_iou_reward": 0.7809002349536305, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6900140643119812, "rewards/thk_ans_format_reward": 1.0, "step": 1061, "think_completion_length": 56.25 }, { "clip_ratio": 0.0, "completion_length": 118.171875, "epoch": 1.7925801011804383, "grad_norm": 5.196701064961896, "kl": 0.453125, "learning_rate": 6.418212478920742e-07, "loss": 0.0004, "reward": 3.6945523023605347, "reward_std": 0.13177293725311756, "rewards/final_reward": 1.6393652382683739, "rewards/mask_iou_reward": 0.8196826191341869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6945521235466003, "rewards/thk_ans_format_reward": 1.0, "step": 1062, "think_completion_length": 52.34375 }, { "clip_ratio": 0.0, "completion_length": 132.9375, "epoch": 1.794266441821248, "grad_norm": 32.92921107957651, "kl": 0.4814453125, "learning_rate": 6.414839797639124e-07, "loss": 0.0005, "reward": 2.8126556873321533, "reward_std": 0.27009210735559464, "rewards/final_reward": 0.7486795351149669, "rewards/mask_iou_reward": 0.37433976755748344, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8126558065414429, "rewards/thk_ans_format_reward": 1.0, "step": 1063, "think_completion_length": 55.71875 }, { "clip_ratio": 0.0, "completion_length": 122.4375, "epoch": 1.7959527824620574, "grad_norm": 16.048422949673412, "kl": 0.53125, "learning_rate": 6.411467116357505e-07, "loss": 0.0005, "reward": 3.3275065422058105, "reward_std": 0.04513479955494404, "rewards/final_reward": 1.114819221313844, "rewards/mask_iou_reward": 0.557409610656922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3275066614151, "rewards/thk_ans_format_reward": 1.0, "step": 1064, "think_completion_length": 46.65625 }, { "clip_ratio": 0.0, "completion_length": 133.953125, "epoch": 1.7976391231028668, "grad_norm": 4.704587642281763, "kl": 0.478515625, "learning_rate": 6.408094435075884e-07, "loss": 0.0005, "reward": 3.2085570096969604, "reward_std": 0.2231890894472599, "rewards/final_reward": 1.7234846949920737, "rewards/mask_iou_reward": 0.8617423474960368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2085569500923157, "rewards/thk_ans_format_reward": 1.0, "step": 1065, "think_completion_length": 50.25 }, { "clip_ratio": 0.0, "completion_length": 120.328125, "epoch": 1.7993254637436762, "grad_norm": 7.694097018347998, "kl": 0.513671875, "learning_rate": 6.404721753794266e-07, "loss": 0.0005, "reward": 2.546392798423767, "reward_std": 0.24064208567142487, "rewards/final_reward": 0.6668356472255468, "rewards/mask_iou_reward": 0.3334178236127734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5463928133249283, "rewards/thk_ans_format_reward": 1.0, "step": 1066, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 123.171875, "epoch": 1.8010118043844856, "grad_norm": 15.129890320245305, "kl": 0.4677734375, "learning_rate": 6.401349072512647e-07, "loss": 0.0005, "reward": 3.544395089149475, "reward_std": 0.04383156634867191, "rewards/final_reward": 1.2383304283535561, "rewards/mask_iou_reward": 0.6191652141767781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5443950295448303, "rewards/thk_ans_format_reward": 1.0, "step": 1067, "think_completion_length": 47.28125 }, { "clip_ratio": 0.0, "completion_length": 121.515625, "epoch": 1.8026981450252952, "grad_norm": 10.479996959556871, "kl": 0.4609375, "learning_rate": 6.397976391231028e-07, "loss": 0.0005, "reward": 3.6842243671417236, "reward_std": 0.08089240174740553, "rewards/final_reward": 1.6949048454045466, "rewards/mask_iou_reward": 0.8474524227022733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6842244267463684, "rewards/thk_ans_format_reward": 1.0, "step": 1068, "think_completion_length": 53.34375 }, { "clip_ratio": 0.0, "completion_length": 115.671875, "epoch": 1.8043844856661044, "grad_norm": 87.39047841596138, "kl": 0.453125, "learning_rate": 6.39460370994941e-07, "loss": 0.0005, "reward": 3.355344772338867, "reward_std": 0.19397838786244392, "rewards/final_reward": 1.704898139101287, "rewards/mask_iou_reward": 0.8524490695506435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3553447723388672, "rewards/thk_ans_format_reward": 1.0, "step": 1069, "think_completion_length": 43.125 }, { "clip_ratio": 0.0, "completion_length": 117.359375, "epoch": 1.806070826306914, "grad_norm": 7.768601686887431, "kl": 0.4921875, "learning_rate": 6.391231028667791e-07, "loss": 0.0005, "reward": 2.9813655614852905, "reward_std": 0.12172066420316696, "rewards/final_reward": 0.9793375309635147, "rewards/mask_iou_reward": 0.48966876548175736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9813654273748398, "rewards/thk_ans_format_reward": 1.0, "step": 1070, "think_completion_length": 48.75 }, { "clip_ratio": 0.0, "completion_length": 103.40625, "epoch": 1.8077571669477235, "grad_norm": 11.418688751202348, "kl": 0.49609375, "learning_rate": 6.387858347386172e-07, "loss": 0.0005, "reward": 3.1727246046066284, "reward_std": 0.2514045834541321, "rewards/final_reward": 1.5329492331264845, "rewards/mask_iou_reward": 0.7664746165632422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1727246642112732, "rewards/thk_ans_format_reward": 1.0, "step": 1071, "think_completion_length": 39.34375 }, { "clip_ratio": 0.0, "completion_length": 115.953125, "epoch": 1.809443507588533, "grad_norm": 7.214437740443753, "kl": 0.630859375, "learning_rate": 6.384485666104554e-07, "loss": 0.0006, "reward": 3.3107919692993164, "reward_std": 0.15224889293313026, "rewards/final_reward": 0.9352677655935655, "rewards/mask_iou_reward": 0.46763388279678275, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3107921481132507, "rewards/thk_ans_format_reward": 1.0, "step": 1072, "think_completion_length": 48.21875 }, { "clip_ratio": 0.0, "completion_length": 124.125, "epoch": 1.8111298482293423, "grad_norm": 17.597183249832156, "kl": 0.4599609375, "learning_rate": 6.381112984822933e-07, "loss": 0.0005, "reward": 3.603898763656616, "reward_std": 0.1738036908209324, "rewards/final_reward": 1.9033266308019536, "rewards/mask_iou_reward": 0.9516633154009768, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6038986444473267, "rewards/thk_ans_format_reward": 1.0, "step": 1073, "think_completion_length": 52.28125 }, { "clip_ratio": 0.0, "completion_length": 123.375, "epoch": 1.8128161888701517, "grad_norm": 8.705159581822338, "kl": 0.494140625, "learning_rate": 6.377740303541314e-07, "loss": 0.0005, "reward": 3.3796963691711426, "reward_std": 0.16491149365901947, "rewards/final_reward": 1.393316190307897, "rewards/mask_iou_reward": 0.6966580951539485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.379696547985077, "rewards/thk_ans_format_reward": 1.0, "step": 1074, "think_completion_length": 44.0625 }, { "clip_ratio": 0.0, "completion_length": 130.828125, "epoch": 1.8145025295109614, "grad_norm": 31.20759043501845, "kl": 0.451171875, "learning_rate": 6.374367622259696e-07, "loss": 0.0005, "reward": 3.3295449018478394, "reward_std": 0.12656425312161446, "rewards/final_reward": 1.7795149105856936, "rewards/mask_iou_reward": 0.8897574552928468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3295449018478394, "rewards/thk_ans_format_reward": 1.0, "step": 1075, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 111.703125, "epoch": 1.8161888701517706, "grad_norm": 5.87686245738229, "kl": 0.5361328125, "learning_rate": 6.370994940978077e-07, "loss": 0.0005, "reward": 3.0567972660064697, "reward_std": 0.2288198471069336, "rewards/final_reward": 1.0152084192726702, "rewards/mask_iou_reward": 0.5076042096363351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0567971467971802, "rewards/thk_ans_format_reward": 1.0, "step": 1076, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 123.6875, "epoch": 1.8178752107925802, "grad_norm": 15.31134271285599, "kl": 0.486328125, "learning_rate": 6.367622259696458e-07, "loss": 0.0005, "reward": 3.3816200494766235, "reward_std": 0.0680837333202362, "rewards/final_reward": 1.6581806400046022, "rewards/mask_iou_reward": 0.8290903200023011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3816199898719788, "rewards/thk_ans_format_reward": 1.0, "step": 1077, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 114.125, "epoch": 1.8195615514333894, "grad_norm": 39.16886392147183, "kl": 0.484375, "learning_rate": 6.36424957841484e-07, "loss": 0.0005, "reward": 3.5349349975585938, "reward_std": 0.10329584777355194, "rewards/final_reward": 1.5334088335683878, "rewards/mask_iou_reward": 0.7667044167841939, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5349348783493042, "rewards/thk_ans_format_reward": 1.0, "step": 1078, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 148.5, "epoch": 1.821247892074199, "grad_norm": 7.611256628332114, "kl": 0.4736328125, "learning_rate": 6.360876897133221e-07, "loss": 0.0005, "reward": 3.3147194385528564, "reward_std": 0.074610386043787, "rewards/final_reward": 1.6858395633390624, "rewards/mask_iou_reward": 0.8429197816695312, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3147195279598236, "rewards/thk_ans_format_reward": 1.0, "step": 1079, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 148.0, "epoch": 1.8229342327150084, "grad_norm": 12.382623549496923, "kl": 0.509765625, "learning_rate": 6.357504215851602e-07, "loss": 0.0005, "reward": 2.8961130380630493, "reward_std": 0.24354761838912964, "rewards/final_reward": 0.6990347408050159, "rewards/mask_iou_reward": 0.34951737040250797, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8961129784584045, "rewards/thk_ans_format_reward": 1.0, "step": 1080, "think_completion_length": 42.59375 }, { "clip_ratio": 0.0, "completion_length": 111.640625, "epoch": 1.8246205733558178, "grad_norm": 12.439318435799716, "kl": 0.5390625, "learning_rate": 6.354131534569983e-07, "loss": 0.0005, "reward": 3.2025067806243896, "reward_std": 0.10669799149036407, "rewards/final_reward": 1.0149730957567362, "rewards/mask_iou_reward": 0.5074865478783681, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.202506572008133, "rewards/thk_ans_format_reward": 1.0, "step": 1081, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 127.4375, "epoch": 1.8263069139966275, "grad_norm": 6.201103280303299, "kl": 0.55078125, "learning_rate": 6.350758853288363e-07, "loss": 0.0006, "reward": 3.2389400005340576, "reward_std": 0.06579168047755957, "rewards/final_reward": 1.4973005004889282, "rewards/mask_iou_reward": 0.7486502502444641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2389401197433472, "rewards/thk_ans_format_reward": 1.0, "step": 1082, "think_completion_length": 63.90625 }, { "clip_ratio": 0.0, "completion_length": 113.59375, "epoch": 1.8279932546374367, "grad_norm": 11.23582946892846, "kl": 0.466796875, "learning_rate": 6.347386172006744e-07, "loss": 0.0005, "reward": 3.2684438228607178, "reward_std": 0.19575618207454681, "rewards/final_reward": 1.6867246645460408, "rewards/mask_iou_reward": 0.8433623322730204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.268443763256073, "rewards/thk_ans_format_reward": 1.0, "step": 1083, "think_completion_length": 40.4375 }, { "clip_ratio": 0.0, "completion_length": 117.765625, "epoch": 1.8296795952782463, "grad_norm": 12.126812671752475, "kl": 0.654296875, "learning_rate": 6.344013490725126e-07, "loss": 0.0007, "reward": 3.1550729274749756, "reward_std": 0.5026094168424606, "rewards/final_reward": 1.0971722753586644, "rewards/mask_iou_reward": 0.5485861376793322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1550728678703308, "rewards/thk_ans_format_reward": 1.0, "step": 1084, "think_completion_length": 49.34375 }, { "clip_ratio": 0.0, "completion_length": 143.578125, "epoch": 1.8313659359190555, "grad_norm": 18.21812835633944, "kl": 0.4580078125, "learning_rate": 6.340640809443507e-07, "loss": 0.0005, "reward": 2.624867796897888, "reward_std": 0.22860441729426384, "rewards/final_reward": 0.8603130411544049, "rewards/mask_iou_reward": 0.43015652057720244, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.6404927968978882, "rewards/thk_ans_format_reward": 1.0, "step": 1085, "think_completion_length": 42.03125 }, { "clip_ratio": 0.0, "completion_length": 115.890625, "epoch": 1.8330522765598651, "grad_norm": 5.478540155142122, "kl": 0.5126953125, "learning_rate": 6.337268128161889e-07, "loss": 0.0005, "reward": 3.58796763420105, "reward_std": 0.012379450490698218, "rewards/final_reward": 1.3301058680886824, "rewards/mask_iou_reward": 0.6650529340443412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.587967574596405, "rewards/thk_ans_format_reward": 1.0, "step": 1086, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 118.515625, "epoch": 1.8347386172006745, "grad_norm": 13.300127679236585, "kl": 0.6044921875, "learning_rate": 6.33389544688027e-07, "loss": 0.0006, "reward": 3.036729335784912, "reward_std": 0.054497267585247755, "rewards/final_reward": 0.6875000276307961, "rewards/mask_iou_reward": 0.34375001381539805, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0367292761802673, "rewards/thk_ans_format_reward": 1.0, "step": 1087, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 113.328125, "epoch": 1.836424957841484, "grad_norm": 7.887733188962814, "kl": 0.46484375, "learning_rate": 6.330522765598651e-07, "loss": 0.0005, "reward": 2.800957202911377, "reward_std": 0.4332638531923294, "rewards/final_reward": 1.2232966038997053, "rewards/mask_iou_reward": 0.6116483019498526, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8009572625160217, "rewards/thk_ans_format_reward": 1.0, "step": 1088, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 122.078125, "epoch": 1.8381112984822934, "grad_norm": 21.665998281230607, "kl": 0.482421875, "learning_rate": 6.327150084317033e-07, "loss": 0.0005, "reward": 3.037529468536377, "reward_std": 0.2697841115295887, "rewards/final_reward": 0.8968575313778724, "rewards/mask_iou_reward": 0.4484287656889362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.037529468536377, "rewards/thk_ans_format_reward": 1.0, "step": 1089, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 125.28125, "epoch": 1.8397976391231028, "grad_norm": 13.069209866194338, "kl": 0.4541015625, "learning_rate": 6.323777403035413e-07, "loss": 0.0005, "reward": 3.4228765964508057, "reward_std": 0.2066943645477295, "rewards/final_reward": 1.433857315425791, "rewards/mask_iou_reward": 0.7169286577128955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4228765368461609, "rewards/thk_ans_format_reward": 1.0, "step": 1090, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 122.21875, "epoch": 1.8414839797639124, "grad_norm": 6.749037357878424, "kl": 0.4404296875, "learning_rate": 6.320404721753793e-07, "loss": 0.0004, "reward": 2.5815987586975098, "reward_std": 0.1750339277787134, "rewards/final_reward": 0.39387072797870026, "rewards/mask_iou_reward": 0.19693536398935013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5815986543893814, "rewards/thk_ans_format_reward": 1.0, "step": 1091, "think_completion_length": 50.3125 }, { "clip_ratio": 0.0, "completion_length": 125.1875, "epoch": 1.8431703204047216, "grad_norm": 19.693867653877927, "kl": 0.484375, "learning_rate": 6.317032040472175e-07, "loss": 0.0005, "reward": 3.2191600799560547, "reward_std": 0.34185342490673065, "rewards/final_reward": 1.082252412484555, "rewards/mask_iou_reward": 0.5411262062422775, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.219160258769989, "rewards/thk_ans_format_reward": 1.0, "step": 1092, "think_completion_length": 43.84375 }, { "clip_ratio": 0.0, "completion_length": 117.921875, "epoch": 1.8448566610455313, "grad_norm": 8.681066579359914, "kl": 0.50390625, "learning_rate": 6.313659359190556e-07, "loss": 0.0005, "reward": 3.290645480155945, "reward_std": 0.11531023494899273, "rewards/final_reward": 0.7578482514866762, "rewards/mask_iou_reward": 0.3789241257433381, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2906455397605896, "rewards/thk_ans_format_reward": 1.0, "step": 1093, "think_completion_length": 52.65625 }, { "clip_ratio": 0.0, "completion_length": 123.09375, "epoch": 1.8465430016863407, "grad_norm": 14.19594490203886, "kl": 0.501953125, "learning_rate": 6.310286677908937e-07, "loss": 0.0005, "reward": 2.789927124977112, "reward_std": 0.1311767096631229, "rewards/final_reward": 1.042739142866941, "rewards/mask_iou_reward": 0.5213695714334705, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7899271547794342, "rewards/thk_ans_format_reward": 1.0, "step": 1094, "think_completion_length": 49.28125 }, { "clip_ratio": 0.0, "completion_length": 115.046875, "epoch": 1.84822934232715, "grad_norm": 7.918474446662982, "kl": 0.44921875, "learning_rate": 6.306913996627319e-07, "loss": 0.0005, "reward": 3.8491674661636353, "reward_std": 0.022587507497519255, "rewards/final_reward": 1.8873137668664528, "rewards/mask_iou_reward": 0.9436568834332264, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8491673469543457, "rewards/thk_ans_format_reward": 1.0, "step": 1095, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 119.65625, "epoch": 1.8499156829679595, "grad_norm": 8.515437657907258, "kl": 0.62109375, "learning_rate": 6.3035413153457e-07, "loss": 0.0006, "reward": 3.223744511604309, "reward_std": 0.384935200214386, "rewards/final_reward": 1.5109505036988033, "rewards/mask_iou_reward": 0.7554752518494017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2237444519996643, "rewards/thk_ans_format_reward": 1.0, "step": 1096, "think_completion_length": 53.40625 }, { "clip_ratio": 0.0, "completion_length": 138.078125, "epoch": 1.851602023608769, "grad_norm": 7.834563260850435, "kl": 0.57421875, "learning_rate": 6.300168634064081e-07, "loss": 0.0006, "reward": 2.8379101753234863, "reward_std": 0.19312208145856857, "rewards/final_reward": 0.766568190172088, "rewards/mask_iou_reward": 0.383284095086044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8379101008176804, "rewards/thk_ans_format_reward": 1.0, "step": 1097, "think_completion_length": 48.34375 }, { "clip_ratio": 0.0, "completion_length": 120.15625, "epoch": 1.8532883642495785, "grad_norm": 14.297124589618678, "kl": 0.4443359375, "learning_rate": 6.296795952782462e-07, "loss": 0.0004, "reward": 3.3441884517669678, "reward_std": 0.03978629596531391, "rewards/final_reward": 0.8399567347734211, "rewards/mask_iou_reward": 0.41997836738671057, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.344188630580902, "rewards/thk_ans_format_reward": 1.0, "step": 1098, "think_completion_length": 51.71875 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 1.8549747048903877, "grad_norm": 9.448604353193767, "kl": 0.818359375, "learning_rate": 6.293423271500843e-07, "loss": 0.0008, "reward": 2.577287793159485, "reward_std": 0.10695656202733517, "rewards/final_reward": 0.17409313455119524, "rewards/mask_iou_reward": 0.08704656727559762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5772877894341946, "rewards/thk_ans_format_reward": 1.0, "step": 1099, "think_completion_length": 51.4375 }, { "clip_ratio": 0.0, "completion_length": 126.5, "epoch": 1.8566610455311974, "grad_norm": 41.754579974458586, "kl": 0.501953125, "learning_rate": 6.290050590219223e-07, "loss": 0.0005, "reward": 3.414603114128113, "reward_std": 0.07115489459829405, "rewards/final_reward": 1.2164036374620517, "rewards/mask_iou_reward": 0.6082018187310259, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4146031737327576, "rewards/thk_ans_format_reward": 1.0, "step": 1100, "think_completion_length": 50.75 }, { "clip_ratio": 0.0, "completion_length": 116.8125, "epoch": 1.8583473861720068, "grad_norm": 21.23442828983993, "kl": 0.611328125, "learning_rate": 6.286677908937605e-07, "loss": 0.0006, "reward": 3.3198102712631226, "reward_std": 0.045665791258215904, "rewards/final_reward": 1.6833926643806696, "rewards/mask_iou_reward": 0.8416963321903348, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.319810390472412, "rewards/thk_ans_format_reward": 1.0, "step": 1101, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 118.1875, "epoch": 1.8600337268128162, "grad_norm": 7.060350179958177, "kl": 0.486328125, "learning_rate": 6.283305227655986e-07, "loss": 0.0005, "reward": 3.4817529916763306, "reward_std": 0.057911899872124195, "rewards/final_reward": 1.4049579396114342, "rewards/mask_iou_reward": 0.7024789698057171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4817529320716858, "rewards/thk_ans_format_reward": 1.0, "step": 1102, "think_completion_length": 48.84375 }, { "clip_ratio": 0.0, "completion_length": 119.78125, "epoch": 1.8617200674536256, "grad_norm": 10.113922661906768, "kl": 0.4765625, "learning_rate": 6.279932546374367e-07, "loss": 0.0005, "reward": 2.7638001441955566, "reward_std": 0.13444151729345322, "rewards/final_reward": 0.11543181604880562, "rewards/mask_iou_reward": 0.05771590802440281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7638000845909119, "rewards/thk_ans_format_reward": 1.0, "step": 1103, "think_completion_length": 48.25 }, { "clip_ratio": 0.0, "completion_length": 128.578125, "epoch": 1.863406408094435, "grad_norm": 12.2379733263627, "kl": 0.4501953125, "learning_rate": 6.276559865092749e-07, "loss": 0.0004, "reward": 2.952306628227234, "reward_std": 0.2903987839818001, "rewards/final_reward": 0.2940298889414863, "rewards/mask_iou_reward": 0.14701494447074315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9523066282272339, "rewards/thk_ans_format_reward": 1.0, "step": 1104, "think_completion_length": 63.90625 }, { "clip_ratio": 0.0, "completion_length": 105.78125, "epoch": 1.8650927487352447, "grad_norm": 7.307629859355944, "kl": 0.458984375, "learning_rate": 6.27318718381113e-07, "loss": 0.0004, "reward": 3.6160776615142822, "reward_std": 0.26231749448925257, "rewards/final_reward": 1.918327592949087, "rewards/mask_iou_reward": 0.9591637964745435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6160775423049927, "rewards/thk_ans_format_reward": 1.0, "step": 1105, "think_completion_length": 46.9375 }, { "clip_ratio": 0.0, "completion_length": 121.75, "epoch": 1.8667790893760539, "grad_norm": 8.120904059288417, "kl": 0.4375, "learning_rate": 6.26981450252951e-07, "loss": 0.0005, "reward": 2.6095887422561646, "reward_std": 0.36206041276454926, "rewards/final_reward": 0.18646807204091528, "rewards/mask_iou_reward": 0.09323403602045764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6095886826515198, "rewards/thk_ans_format_reward": 1.0, "step": 1106, "think_completion_length": 55.0 }, { "clip_ratio": 0.0, "completion_length": 117.96875, "epoch": 1.8684654300168635, "grad_norm": 4.949742663888367, "kl": 0.4736328125, "learning_rate": 6.266441821247892e-07, "loss": 0.0005, "reward": 3.171900749206543, "reward_std": 0.1200435683131218, "rewards/final_reward": 1.7301323434279121, "rewards/mask_iou_reward": 0.8650661717139561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1719006299972534, "rewards/thk_ans_format_reward": 1.0, "step": 1107, "think_completion_length": 49.25 }, { "clip_ratio": 0.0, "completion_length": 113.140625, "epoch": 1.8701517706576727, "grad_norm": 6.932244002142069, "kl": 0.501953125, "learning_rate": 6.263069139966273e-07, "loss": 0.0005, "reward": 2.9779754877090454, "reward_std": 0.1466265469789505, "rewards/final_reward": 1.129309473098064, "rewards/mask_iou_reward": 0.564654736549032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9779754877090454, "rewards/thk_ans_format_reward": 1.0, "step": 1108, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 118.140625, "epoch": 1.8718381112984823, "grad_norm": 7.343964169963569, "kl": 0.4931640625, "learning_rate": 6.259696458684654e-07, "loss": 0.0005, "reward": 2.8175435066223145, "reward_std": 0.016309996135532856, "rewards/final_reward": 0.8358996803031749, "rewards/mask_iou_reward": 0.4179498401515874, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8175435662269592, "rewards/thk_ans_format_reward": 1.0, "step": 1109, "think_completion_length": 48.6875 }, { "clip_ratio": 0.0, "completion_length": 118.609375, "epoch": 1.8735244519392917, "grad_norm": 5.530330908664622, "kl": 0.505859375, "learning_rate": 6.256323777403035e-07, "loss": 0.0005, "reward": 3.406356692314148, "reward_std": 0.13961811736226082, "rewards/final_reward": 1.3290365300351592, "rewards/mask_iou_reward": 0.6645182650175796, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4063568115234375, "rewards/thk_ans_format_reward": 1.0, "step": 1110, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 117.28125, "epoch": 1.8752107925801011, "grad_norm": 5.96310263404028, "kl": 0.517578125, "learning_rate": 6.252951096121416e-07, "loss": 0.0005, "reward": 3.4238619804382324, "reward_std": 0.2606248203665018, "rewards/final_reward": 1.360026099518807, "rewards/mask_iou_reward": 0.6800130497594035, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4238619804382324, "rewards/thk_ans_format_reward": 1.0, "step": 1111, "think_completion_length": 47.53125 }, { "clip_ratio": 0.0, "completion_length": 130.90625, "epoch": 1.8768971332209108, "grad_norm": 6.039639555921078, "kl": 0.455078125, "learning_rate": 6.249578414839798e-07, "loss": 0.0005, "reward": 3.1803025007247925, "reward_std": 0.14678914099931717, "rewards/final_reward": 0.7781353902486867, "rewards/mask_iou_reward": 0.38906769512434336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.18030247092247, "rewards/thk_ans_format_reward": 1.0, "step": 1112, "think_completion_length": 54.8125 }, { "clip_ratio": 0.0, "completion_length": 186.59375, "epoch": 1.87858347386172, "grad_norm": 8.678069428182924, "kl": 0.4697265625, "learning_rate": 6.246205733558179e-07, "loss": 0.0005, "reward": 3.3460100889205933, "reward_std": 0.2927638292312622, "rewards/final_reward": 1.6371790768525196, "rewards/mask_iou_reward": 0.8185895384262598, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3616352081298828, "rewards/thk_ans_format_reward": 1.0, "step": 1113, "think_completion_length": 58.5625 }, { "clip_ratio": 0.0, "completion_length": 122.15625, "epoch": 1.8802698145025296, "grad_norm": 19.248741631074427, "kl": 0.462890625, "learning_rate": 6.242833052276559e-07, "loss": 0.0005, "reward": 3.0415929555892944, "reward_std": 0.15946677327156067, "rewards/final_reward": 0.6804740707632364, "rewards/mask_iou_reward": 0.3402370353816182, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0415929555892944, "rewards/thk_ans_format_reward": 1.0, "step": 1114, "think_completion_length": 55.4375 }, { "clip_ratio": 0.0, "completion_length": 123.859375, "epoch": 1.8819561551433388, "grad_norm": 4.513737480234631, "kl": 0.4755859375, "learning_rate": 6.239460370994941e-07, "loss": 0.0005, "reward": 2.9178273677825928, "reward_std": 0.13480617478489876, "rewards/final_reward": 0.7582901537480667, "rewards/mask_iou_reward": 0.37914507687403337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9178274273872375, "rewards/thk_ans_format_reward": 1.0, "step": 1115, "think_completion_length": 59.78125 }, { "clip_ratio": 0.0, "completion_length": 120.984375, "epoch": 1.8836424957841484, "grad_norm": 9.499820735106061, "kl": 0.5, "learning_rate": 6.236087689713322e-07, "loss": 0.0005, "reward": 3.2555822134017944, "reward_std": 0.29104815423488617, "rewards/final_reward": 1.06362789651073, "rewards/mask_iou_reward": 0.531813948255365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2555820941925049, "rewards/thk_ans_format_reward": 1.0, "step": 1116, "think_completion_length": 57.46875 }, { "clip_ratio": 0.0, "completion_length": 122.015625, "epoch": 1.8853288364249579, "grad_norm": 14.094908197904413, "kl": 0.47265625, "learning_rate": 6.232715008431702e-07, "loss": 0.0005, "reward": 3.2906464338302612, "reward_std": 0.14907943457365036, "rewards/final_reward": 0.9073393636444084, "rewards/mask_iou_reward": 0.4536696818222042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2906464636325836, "rewards/thk_ans_format_reward": 1.0, "step": 1117, "think_completion_length": 57.0 }, { "clip_ratio": 0.0, "completion_length": 117.609375, "epoch": 1.8870151770657673, "grad_norm": 14.820800134042324, "kl": 0.458984375, "learning_rate": 6.229342327150084e-07, "loss": 0.0005, "reward": 3.3356423377990723, "reward_std": 0.33301595598459244, "rewards/final_reward": 1.497528428462273, "rewards/mask_iou_reward": 0.7487642142311365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3356422781944275, "rewards/thk_ans_format_reward": 1.0, "step": 1118, "think_completion_length": 59.375 }, { "clip_ratio": 0.0, "completion_length": 121.75, "epoch": 1.8887015177065767, "grad_norm": 3.7472328819560543, "kl": 0.447265625, "learning_rate": 6.225969645868465e-07, "loss": 0.0004, "reward": 3.4780514240264893, "reward_std": 0.04463301133364439, "rewards/final_reward": 1.4461199073916258, "rewards/mask_iou_reward": 0.7230599536958129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4780513644218445, "rewards/thk_ans_format_reward": 1.0, "step": 1119, "think_completion_length": 49.46875 }, { "clip_ratio": 0.0, "completion_length": 153.25, "epoch": 1.890387858347386, "grad_norm": 5.568276911965404, "kl": 0.474609375, "learning_rate": 6.222596964586846e-07, "loss": 0.0005, "reward": 3.1224676370620728, "reward_std": 0.19673360884189606, "rewards/final_reward": 1.381617815496326, "rewards/mask_iou_reward": 0.690808907748163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.122467577457428, "rewards/thk_ans_format_reward": 1.0, "step": 1120, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 128.828125, "epoch": 1.8920741989881957, "grad_norm": 29.02279025388352, "kl": 0.4521484375, "learning_rate": 6.219224283305228e-07, "loss": 0.0005, "reward": 3.0229815244674683, "reward_std": 0.2775159105658531, "rewards/final_reward": 1.3215004490247066, "rewards/mask_iou_reward": 0.6607502245123533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0229815542697906, "rewards/thk_ans_format_reward": 1.0, "step": 1121, "think_completion_length": 50.21875 }, { "clip_ratio": 0.0, "completion_length": 152.15625, "epoch": 1.893760539629005, "grad_norm": 8.50671484702284, "kl": 0.4091796875, "learning_rate": 6.215851602023609e-07, "loss": 0.0004, "reward": 3.070726156234741, "reward_std": 0.4190548211336136, "rewards/final_reward": 1.7271931964343437, "rewards/mask_iou_reward": 0.8635965982171718, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0707260966300964, "rewards/thk_ans_format_reward": 1.0, "step": 1122, "think_completion_length": 52.65625 }, { "clip_ratio": 0.0, "completion_length": 124.6875, "epoch": 1.8954468802698146, "grad_norm": 41.49957656934944, "kl": 0.501953125, "learning_rate": 6.212478920741989e-07, "loss": 0.0005, "reward": 3.3182214498519897, "reward_std": 0.14836269989609718, "rewards/final_reward": 1.350961792262883, "rewards/mask_iou_reward": 0.6754808961314415, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3182214498519897, "rewards/thk_ans_format_reward": 1.0, "step": 1123, "think_completion_length": 54.5 }, { "clip_ratio": 0.0, "completion_length": 151.640625, "epoch": 1.897133220910624, "grad_norm": 6.733032689079996, "kl": 0.4013671875, "learning_rate": 6.209106239460371e-07, "loss": 0.0004, "reward": 3.5246121883392334, "reward_std": 0.2807541564106941, "rewards/final_reward": 1.4699369159761246, "rewards/mask_iou_reward": 0.7349684579880623, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5246121883392334, "rewards/thk_ans_format_reward": 1.0, "step": 1124, "think_completion_length": 50.3125 }, { "clip_ratio": 0.0, "completion_length": 116.609375, "epoch": 1.8988195615514334, "grad_norm": 12.621584714509966, "kl": 0.4697265625, "learning_rate": 6.205733558178752e-07, "loss": 0.0005, "reward": 3.1331878900527954, "reward_std": 0.12203128053806722, "rewards/final_reward": 0.42254711525684857, "rewards/mask_iou_reward": 0.21127355762842429, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1488128304481506, "rewards/thk_ans_format_reward": 1.0, "step": 1125, "think_completion_length": 46.71875 }, { "clip_ratio": 0.0, "completion_length": 124.296875, "epoch": 1.9005059021922428, "grad_norm": 9.673146215837875, "kl": 0.4873046875, "learning_rate": 6.202360876897132e-07, "loss": 0.0005, "reward": 3.5485845804214478, "reward_std": 0.05991579405963421, "rewards/final_reward": 1.4075040653508908, "rewards/mask_iou_reward": 0.7037520326754454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.548584520816803, "rewards/thk_ans_format_reward": 1.0, "step": 1126, "think_completion_length": 54.96875 }, { "clip_ratio": 0.0, "completion_length": 125.90625, "epoch": 1.9021922428330522, "grad_norm": 5.361577006546381, "kl": 0.541015625, "learning_rate": 6.198988195615514e-07, "loss": 0.0005, "reward": 3.0988335609436035, "reward_std": 0.1889047771692276, "rewards/final_reward": 1.1874666094459694, "rewards/mask_iou_reward": 0.5937333047229847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0988333821296692, "rewards/thk_ans_format_reward": 1.0, "step": 1127, "think_completion_length": 54.5 }, { "clip_ratio": 0.0, "completion_length": 128.125, "epoch": 1.9038785834738619, "grad_norm": 5.952423108624937, "kl": 0.458984375, "learning_rate": 6.195615514333895e-07, "loss": 0.0005, "reward": 3.1148312091827393, "reward_std": 0.14229458943009377, "rewards/final_reward": 1.3390039678383898, "rewards/mask_iou_reward": 0.6695019839191949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1148313283920288, "rewards/thk_ans_format_reward": 1.0, "step": 1128, "think_completion_length": 63.3125 }, { "clip_ratio": 0.0, "completion_length": 135.546875, "epoch": 1.905564924114671, "grad_norm": 8.254122000550824, "kl": 0.458984375, "learning_rate": 6.192242833052276e-07, "loss": 0.0005, "reward": 2.7562084197998047, "reward_std": 0.39115703105926514, "rewards/final_reward": 0.8909723524188679, "rewards/mask_iou_reward": 0.4454861762094339, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7562084496021271, "rewards/thk_ans_format_reward": 1.0, "step": 1129, "think_completion_length": 55.53125 }, { "clip_ratio": 0.0, "completion_length": 130.21875, "epoch": 1.9072512647554807, "grad_norm": 8.147271330894098, "kl": 0.447265625, "learning_rate": 6.188870151770658e-07, "loss": 0.0004, "reward": 3.0163233280181885, "reward_std": 0.12908070534467697, "rewards/final_reward": 0.8853229324141032, "rewards/mask_iou_reward": 0.4426614662070516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.016323447227478, "rewards/thk_ans_format_reward": 1.0, "step": 1130, "think_completion_length": 54.625 }, { "clip_ratio": 0.0, "completion_length": 118.421875, "epoch": 1.90893760539629, "grad_norm": 14.783755350176113, "kl": 0.4609375, "learning_rate": 6.185497470489038e-07, "loss": 0.0005, "reward": 3.187541961669922, "reward_std": 0.23147797584533691, "rewards/final_reward": 1.5944909250270047, "rewards/mask_iou_reward": 0.7972454625135024, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.187541902065277, "rewards/thk_ans_format_reward": 1.0, "step": 1131, "think_completion_length": 52.0 }, { "clip_ratio": 0.0, "completion_length": 119.484375, "epoch": 1.9106239460370995, "grad_norm": 8.047496221321932, "kl": 0.4609375, "learning_rate": 6.182124789207419e-07, "loss": 0.0005, "reward": 3.281162738800049, "reward_std": 0.09780286997556686, "rewards/final_reward": 0.9920333978301556, "rewards/mask_iou_reward": 0.4960166989150778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.281162679195404, "rewards/thk_ans_format_reward": 1.0, "step": 1132, "think_completion_length": 45.71875 }, { "clip_ratio": 0.0, "completion_length": 119.890625, "epoch": 1.912310286677909, "grad_norm": 9.426326655888504, "kl": 0.4375, "learning_rate": 6.178752107925801e-07, "loss": 0.0004, "reward": 3.7058013677597046, "reward_std": 0.16382237616926432, "rewards/final_reward": 1.8595333019630167, "rewards/mask_iou_reward": 0.9297666509815083, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7058013081550598, "rewards/thk_ans_format_reward": 1.0, "step": 1133, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 126.46875, "epoch": 1.9139966273187183, "grad_norm": 13.810244184431745, "kl": 0.4287109375, "learning_rate": 6.175379426644182e-07, "loss": 0.0004, "reward": 2.5829213857650757, "reward_std": 0.23861295729875565, "rewards/final_reward": 0.649385258994511, "rewards/mask_iou_reward": 0.3246926294972555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5829213559627533, "rewards/thk_ans_format_reward": 1.0, "step": 1134, "think_completion_length": 52.625 }, { "clip_ratio": 0.0, "completion_length": 152.546875, "epoch": 1.915682967959528, "grad_norm": 7.353119304271834, "kl": 0.4150390625, "learning_rate": 6.172006745362564e-07, "loss": 0.0004, "reward": 3.28650164604187, "reward_std": 0.1776575818657875, "rewards/final_reward": 1.434402230781888, "rewards/mask_iou_reward": 0.717201115390944, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2865016460418701, "rewards/thk_ans_format_reward": 1.0, "step": 1135, "think_completion_length": 48.03125 }, { "clip_ratio": 0.0, "completion_length": 145.21875, "epoch": 1.9173693086003372, "grad_norm": 9.605920969673068, "kl": 0.408203125, "learning_rate": 6.168634064080944e-07, "loss": 0.0004, "reward": 3.2172775268554688, "reward_std": 0.1914580576121807, "rewards/final_reward": 1.293543048601166, "rewards/mask_iou_reward": 0.646771524300583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2172774970531464, "rewards/thk_ans_format_reward": 1.0, "step": 1136, "think_completion_length": 55.0625 }, { "clip_ratio": 0.0, "completion_length": 143.46875, "epoch": 1.9190556492411468, "grad_norm": 5.6346964134655675, "kl": 0.591796875, "learning_rate": 6.165261382799325e-07, "loss": 0.0006, "reward": 3.438536763191223, "reward_std": 0.18300874158740044, "rewards/final_reward": 1.463139205462022, "rewards/mask_iou_reward": 0.731569602731011, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4385367631912231, "rewards/thk_ans_format_reward": 1.0, "step": 1137, "think_completion_length": 53.78125 }, { "clip_ratio": 0.0, "completion_length": 122.4375, "epoch": 1.920741989881956, "grad_norm": 8.352522402689223, "kl": 0.513671875, "learning_rate": 6.161888701517707e-07, "loss": 0.0005, "reward": 3.0348947048187256, "reward_std": 0.09543421119451523, "rewards/final_reward": 0.6401037795648319, "rewards/mask_iou_reward": 0.32005188978241594, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0348947048187256, "rewards/thk_ans_format_reward": 1.0, "step": 1138, "think_completion_length": 54.53125 }, { "clip_ratio": 0.0, "completion_length": 155.15625, "epoch": 1.9224283305227656, "grad_norm": 12.679459677972318, "kl": 0.400390625, "learning_rate": 6.158516020236087e-07, "loss": 0.0004, "reward": 2.949214816093445, "reward_std": 0.2778293192386627, "rewards/final_reward": 1.1230542828676453, "rewards/mask_iou_reward": 0.5615271414338227, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9492146372795105, "rewards/thk_ans_format_reward": 1.0, "step": 1139, "think_completion_length": 50.6875 }, { "clip_ratio": 0.0, "completion_length": 144.546875, "epoch": 1.924114671163575, "grad_norm": 8.364381572643524, "kl": 0.42578125, "learning_rate": 6.155143338954468e-07, "loss": 0.0004, "reward": 3.144506812095642, "reward_std": 0.08507668972015381, "rewards/final_reward": 0.7166029162035097, "rewards/mask_iou_reward": 0.35830145810175484, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1445067524909973, "rewards/thk_ans_format_reward": 1.0, "step": 1140, "think_completion_length": 55.5 }, { "clip_ratio": 0.0, "completion_length": 123.734375, "epoch": 1.9258010118043845, "grad_norm": 122.5712585070957, "kl": 0.4560546875, "learning_rate": 6.15177065767285e-07, "loss": 0.0005, "reward": 2.9750025272369385, "reward_std": 0.1318796332925558, "rewards/final_reward": 0.9958614977067766, "rewards/mask_iou_reward": 0.4979307488533883, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9750024378299713, "rewards/thk_ans_format_reward": 1.0, "step": 1141, "think_completion_length": 54.0 }, { "clip_ratio": 0.0, "completion_length": 128.828125, "epoch": 1.927487352445194, "grad_norm": 17.82613440604544, "kl": 0.458984375, "learning_rate": 6.148397976391231e-07, "loss": 0.0005, "reward": 3.3588626384735107, "reward_std": 0.18949565291404724, "rewards/final_reward": 1.3248426813549425, "rewards/mask_iou_reward": 0.6624213406774713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.358862578868866, "rewards/thk_ans_format_reward": 1.0, "step": 1142, "think_completion_length": 51.53125 }, { "clip_ratio": 0.0, "completion_length": 124.546875, "epoch": 1.9291736930860033, "grad_norm": 90.73018968754026, "kl": 0.4677734375, "learning_rate": 6.145025295109612e-07, "loss": 0.0005, "reward": 3.493720531463623, "reward_std": 0.12904378399252892, "rewards/final_reward": 1.0748831457430095, "rewards/mask_iou_reward": 0.5374415728715047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.493720293045044, "rewards/thk_ans_format_reward": 1.0, "step": 1143, "think_completion_length": 50.71875 }, { "clip_ratio": 0.0, "completion_length": 121.734375, "epoch": 1.930860033726813, "grad_norm": 7.788511637518061, "kl": 0.4443359375, "learning_rate": 6.141652613827993e-07, "loss": 0.0005, "reward": 3.612895131111145, "reward_std": 0.06411353871226311, "rewards/final_reward": 1.4882109120853437, "rewards/mask_iou_reward": 0.7441054560426719, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6128951907157898, "rewards/thk_ans_format_reward": 1.0, "step": 1144, "think_completion_length": 50.4375 }, { "clip_ratio": 0.0, "completion_length": 145.859375, "epoch": 1.932546374367622, "grad_norm": 18.964191292290973, "kl": 0.45703125, "learning_rate": 6.138279932546374e-07, "loss": 0.0005, "reward": 3.2941290140151978, "reward_std": 0.17387644201517105, "rewards/final_reward": 0.8750820196472904, "rewards/mask_iou_reward": 0.4375410098236452, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2941290438175201, "rewards/thk_ans_format_reward": 1.0, "step": 1145, "think_completion_length": 56.21875 }, { "clip_ratio": 0.0, "completion_length": 120.015625, "epoch": 1.9342327150084317, "grad_norm": 5.3943910580571774, "kl": 0.4775390625, "learning_rate": 6.134907251264755e-07, "loss": 0.0005, "reward": 3.3576735258102417, "reward_std": 0.27631305903196335, "rewards/final_reward": 1.9525827807939062, "rewards/mask_iou_reward": 0.9762913903969531, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3576735258102417, "rewards/thk_ans_format_reward": 1.0, "step": 1146, "think_completion_length": 50.875 }, { "clip_ratio": 0.0, "completion_length": 127.078125, "epoch": 1.9359190556492412, "grad_norm": 5.605297528578439, "kl": 0.53125, "learning_rate": 6.131534569983137e-07, "loss": 0.0005, "reward": 3.4034253358840942, "reward_std": 0.3214539512991905, "rewards/final_reward": 1.538946388924106, "rewards/mask_iou_reward": 0.769473194462053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4034252166748047, "rewards/thk_ans_format_reward": 1.0, "step": 1147, "think_completion_length": 52.15625 }, { "clip_ratio": 0.0, "completion_length": 121.625, "epoch": 1.9376053962900506, "grad_norm": 11.819715756456931, "kl": 0.4873046875, "learning_rate": 6.128161888701517e-07, "loss": 0.0005, "reward": 3.4282305240631104, "reward_std": 0.24819158017635345, "rewards/final_reward": 1.0713962238537234, "rewards/mask_iou_reward": 0.5356981119268617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4282305240631104, "rewards/thk_ans_format_reward": 1.0, "step": 1148, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 145.265625, "epoch": 1.93929173693086, "grad_norm": 13.390608371093085, "kl": 0.4345703125, "learning_rate": 6.124789207419898e-07, "loss": 0.0004, "reward": 3.344928026199341, "reward_std": 0.26076687313616276, "rewards/final_reward": 1.62952084881511, "rewards/mask_iou_reward": 0.814760424407555, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3761780261993408, "rewards/thk_ans_format_reward": 0.984375, "step": 1149, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 108.0, "epoch": 1.9409780775716694, "grad_norm": 11.501195267830298, "kl": 0.4384765625, "learning_rate": 6.12141652613828e-07, "loss": 0.0004, "reward": 3.365402102470398, "reward_std": 0.43733248114585876, "rewards/final_reward": 1.8439775256589257, "rewards/mask_iou_reward": 0.9219887628294628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3654021620750427, "rewards/thk_ans_format_reward": 1.0, "step": 1150, "think_completion_length": 48.4375 }, { "clip_ratio": 0.0, "completion_length": 121.71875, "epoch": 1.942664418212479, "grad_norm": 9.338780992051273, "kl": 0.484375, "learning_rate": 6.118043844856661e-07, "loss": 0.0005, "reward": 3.5329582691192627, "reward_std": 0.04067577584646642, "rewards/final_reward": 1.7367798965568824, "rewards/mask_iou_reward": 0.8683899482784412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5329583883285522, "rewards/thk_ans_format_reward": 1.0, "step": 1151, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 116.640625, "epoch": 1.9443507588532882, "grad_norm": 4.906728729430427, "kl": 0.46484375, "learning_rate": 6.114671163575042e-07, "loss": 0.0004, "reward": 3.2201439142227173, "reward_std": 0.1605071723461151, "rewards/final_reward": 1.255439579437334, "rewards/mask_iou_reward": 0.627719789718667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2201440632343292, "rewards/thk_ans_format_reward": 1.0, "step": 1152, "think_completion_length": 44.34375 }, { "clip_ratio": 0.0, "completion_length": 126.171875, "epoch": 1.9460370994940979, "grad_norm": 8.460179438461045, "kl": 0.5126953125, "learning_rate": 6.111298482293423e-07, "loss": 0.0005, "reward": 3.0460928678512573, "reward_std": 0.18447109311819077, "rewards/final_reward": 0.9994140920300079, "rewards/mask_iou_reward": 0.49970704601500393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0460927784442902, "rewards/thk_ans_format_reward": 1.0, "step": 1153, "think_completion_length": 55.875 }, { "clip_ratio": 0.0, "completion_length": 131.625, "epoch": 1.9477234401349073, "grad_norm": 7.523583427820683, "kl": 0.501953125, "learning_rate": 6.107925801011804e-07, "loss": 0.0005, "reward": 3.3284194469451904, "reward_std": 0.32087790966033936, "rewards/final_reward": 1.168077053645009, "rewards/mask_iou_reward": 0.5840385268225045, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3284194469451904, "rewards/thk_ans_format_reward": 1.0, "step": 1154, "think_completion_length": 55.59375 }, { "clip_ratio": 0.0, "completion_length": 120.71875, "epoch": 1.9494097807757167, "grad_norm": 6.853582812271886, "kl": 0.474609375, "learning_rate": 6.104553119730185e-07, "loss": 0.0005, "reward": 3.2801350355148315, "reward_std": 0.148701723664999, "rewards/final_reward": 1.2226313129202175, "rewards/mask_iou_reward": 0.6113156564601088, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.280134916305542, "rewards/thk_ans_format_reward": 1.0, "step": 1155, "think_completion_length": 46.125 }, { "clip_ratio": 0.0, "completion_length": 123.046875, "epoch": 1.951096121416526, "grad_norm": 5.466248056159601, "kl": 0.458984375, "learning_rate": 6.101180438448566e-07, "loss": 0.0005, "reward": 2.5636887550354004, "reward_std": 0.048969279043376446, "rewards/final_reward": 0.46655739556382414, "rewards/mask_iou_reward": 0.23327869778191207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5636887392029166, "rewards/thk_ans_format_reward": 1.0, "step": 1156, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 141.171875, "epoch": 1.9527824620573355, "grad_norm": 28.18428620248401, "kl": 0.4375, "learning_rate": 6.097807757166947e-07, "loss": 0.0004, "reward": 3.68328320980072, "reward_std": 0.29689711332321167, "rewards/final_reward": 1.7442000166514389, "rewards/mask_iou_reward": 0.8721000083257194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6832832098007202, "rewards/thk_ans_format_reward": 1.0, "step": 1157, "think_completion_length": 49.375 }, { "clip_ratio": 0.0, "completion_length": 121.953125, "epoch": 1.9544688026981452, "grad_norm": 6.4102496244791345, "kl": 0.482421875, "learning_rate": 6.094435075885328e-07, "loss": 0.0005, "reward": 3.239464044570923, "reward_std": 0.1461598314344883, "rewards/final_reward": 1.5674991248390242, "rewards/mask_iou_reward": 0.7837495624195121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2394639253616333, "rewards/thk_ans_format_reward": 1.0, "step": 1158, "think_completion_length": 49.65625 }, { "clip_ratio": 0.0, "completion_length": 133.625, "epoch": 1.9561551433389543, "grad_norm": 25.05812311249102, "kl": 0.470703125, "learning_rate": 6.09106239460371e-07, "loss": 0.0005, "reward": 3.1786834001541138, "reward_std": 0.06434584688395262, "rewards/final_reward": 0.6210890595120667, "rewards/mask_iou_reward": 0.31054452975603336, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1786834001541138, "rewards/thk_ans_format_reward": 1.0, "step": 1159, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 121.8125, "epoch": 1.957841483979764, "grad_norm": 26.698160368369376, "kl": 0.49609375, "learning_rate": 6.087689713322091e-07, "loss": 0.0005, "reward": 3.458927035331726, "reward_std": 0.11408116295933723, "rewards/final_reward": 1.2309284164893097, "rewards/mask_iou_reward": 0.6154642082446549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4589269161224365, "rewards/thk_ans_format_reward": 1.0, "step": 1160, "think_completion_length": 53.75 }, { "clip_ratio": 0.0, "completion_length": 115.453125, "epoch": 1.9595278246205734, "grad_norm": 9.673867972324873, "kl": 0.46875, "learning_rate": 6.084317032040473e-07, "loss": 0.0005, "reward": 3.612801671028137, "reward_std": 0.14205888658761978, "rewards/final_reward": 1.7749510858760662, "rewards/mask_iou_reward": 0.8874755429380331, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6128017902374268, "rewards/thk_ans_format_reward": 1.0, "step": 1161, "think_completion_length": 49.375 }, { "clip_ratio": 0.0, "completion_length": 117.296875, "epoch": 1.9612141652613828, "grad_norm": 5.787770039930185, "kl": 0.4521484375, "learning_rate": 6.080944350758853e-07, "loss": 0.0004, "reward": 3.0567870140075684, "reward_std": 0.25405317917466164, "rewards/final_reward": 1.1140749716155698, "rewards/mask_iou_reward": 0.5570374858077849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0567870736122131, "rewards/thk_ans_format_reward": 1.0, "step": 1162, "think_completion_length": 52.9375 }, { "clip_ratio": 0.0, "completion_length": 122.671875, "epoch": 1.9629005059021922, "grad_norm": 9.320319779187539, "kl": 0.53125, "learning_rate": 6.077571669477234e-07, "loss": 0.0005, "reward": 3.3574907779693604, "reward_std": 0.14523665606975555, "rewards/final_reward": 1.5528862513209964, "rewards/mask_iou_reward": 0.7764431256604982, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3574907779693604, "rewards/thk_ans_format_reward": 1.0, "step": 1163, "think_completion_length": 51.5625 }, { "clip_ratio": 0.0, "completion_length": 117.65625, "epoch": 1.9645868465430016, "grad_norm": 11.481249074007033, "kl": 0.50390625, "learning_rate": 6.074198988195615e-07, "loss": 0.0005, "reward": 3.671083688735962, "reward_std": 0.1421994436532259, "rewards/final_reward": 1.7619791945670902, "rewards/mask_iou_reward": 0.8809895972835451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6710836291313171, "rewards/thk_ans_format_reward": 1.0, "step": 1164, "think_completion_length": 43.90625 }, { "clip_ratio": 0.0, "completion_length": 120.96875, "epoch": 1.9662731871838113, "grad_norm": 10.034131798115844, "kl": 0.478515625, "learning_rate": 6.070826306913996e-07, "loss": 0.0005, "reward": 3.3944214582443237, "reward_std": 0.1728556640446186, "rewards/final_reward": 1.7270548646100559, "rewards/mask_iou_reward": 0.8635274323050279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3944214880466461, "rewards/thk_ans_format_reward": 1.0, "step": 1165, "think_completion_length": 46.71875 }, { "clip_ratio": 0.0, "completion_length": 119.203125, "epoch": 1.9679595278246205, "grad_norm": 8.306976055936005, "kl": 0.494140625, "learning_rate": 6.067453625632377e-07, "loss": 0.0005, "reward": 3.1458224058151245, "reward_std": 0.10424304194748402, "rewards/final_reward": 1.2310617441756069, "rewards/mask_iou_reward": 0.6155308720878034, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.145822286605835, "rewards/thk_ans_format_reward": 1.0, "step": 1166, "think_completion_length": 49.0625 }, { "clip_ratio": 0.0, "completion_length": 120.671875, "epoch": 1.96964586846543, "grad_norm": 6.85253908949388, "kl": 0.484375, "learning_rate": 6.064080944350759e-07, "loss": 0.0005, "reward": 3.0486035346984863, "reward_std": 0.2735901027917862, "rewards/final_reward": 0.955588583381466, "rewards/mask_iou_reward": 0.477794291690733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0486035346984863, "rewards/thk_ans_format_reward": 1.0, "step": 1167, "think_completion_length": 53.78125 }, { "clip_ratio": 0.0, "completion_length": 123.3125, "epoch": 1.9713322091062393, "grad_norm": 8.427958007136285, "kl": 0.474609375, "learning_rate": 6.06070826306914e-07, "loss": 0.0005, "reward": 3.0173048973083496, "reward_std": 0.37154044955968857, "rewards/final_reward": 0.867946832341649, "rewards/mask_iou_reward": 0.4339734161708245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0173049569129944, "rewards/thk_ans_format_reward": 1.0, "step": 1168, "think_completion_length": 51.5 }, { "clip_ratio": 0.0, "completion_length": 121.625, "epoch": 1.973018549747049, "grad_norm": 7.79990454182524, "kl": 0.48828125, "learning_rate": 6.057335581787521e-07, "loss": 0.0005, "reward": 2.6458925008773804, "reward_std": 0.4773362725973129, "rewards/final_reward": 0.6373366666011101, "rewards/mask_iou_reward": 0.31866833330055505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6458925604820251, "rewards/thk_ans_format_reward": 1.0, "step": 1169, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 118.625, "epoch": 1.9747048903878583, "grad_norm": 4.4719299493800975, "kl": 0.4609375, "learning_rate": 6.053962900505903e-07, "loss": 0.0004, "reward": 3.413856029510498, "reward_std": 0.014244536869227886, "rewards/final_reward": 1.2102401186865768, "rewards/mask_iou_reward": 0.6051200593432884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.413856029510498, "rewards/thk_ans_format_reward": 1.0, "step": 1170, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 120.03125, "epoch": 1.9763912310286678, "grad_norm": 6.177077156973483, "kl": 0.5625, "learning_rate": 6.050590219224283e-07, "loss": 0.0006, "reward": 2.5054807662963867, "reward_std": 0.06103113852441311, "rewards/final_reward": 0.9434630830829109, "rewards/mask_iou_reward": 0.47173154154145547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5054805800318718, "rewards/thk_ans_format_reward": 1.0, "step": 1171, "think_completion_length": 53.90625 }, { "clip_ratio": 0.0, "completion_length": 117.671875, "epoch": 1.9780775716694774, "grad_norm": 7.89724293175771, "kl": 0.498046875, "learning_rate": 6.047217537942663e-07, "loss": 0.0005, "reward": 3.3290287256240845, "reward_std": 0.16214152611792088, "rewards/final_reward": 1.0813684613372112, "rewards/mask_iou_reward": 0.5406842306686056, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3446537256240845, "rewards/thk_ans_format_reward": 1.0, "step": 1172, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 120.53125, "epoch": 1.9797639123102866, "grad_norm": 7.723278921818194, "kl": 0.494140625, "learning_rate": 6.043844856661045e-07, "loss": 0.0005, "reward": 3.550881505012512, "reward_std": 0.20284011587500572, "rewards/final_reward": 1.8957957185039027, "rewards/mask_iou_reward": 0.9478978592519514, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5508816242218018, "rewards/thk_ans_format_reward": 1.0, "step": 1173, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 115.0, "epoch": 1.9814502529510962, "grad_norm": 13.127430971098534, "kl": 0.57421875, "learning_rate": 6.040472175379426e-07, "loss": 0.0006, "reward": 3.3331719636917114, "reward_std": 0.01610415242612362, "rewards/final_reward": 1.524027746238251, "rewards/mask_iou_reward": 0.7620138731191255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.333171784877777, "rewards/thk_ans_format_reward": 1.0, "step": 1174, "think_completion_length": 41.03125 }, { "clip_ratio": 0.0, "completion_length": 116.3125, "epoch": 1.9831365935919054, "grad_norm": 17.88970728571287, "kl": 0.4970703125, "learning_rate": 6.037099494097807e-07, "loss": 0.0005, "reward": 3.2474048137664795, "reward_std": 0.12032559514045715, "rewards/final_reward": 1.0106248733440577, "rewards/mask_iou_reward": 0.5053124366720289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2474048137664795, "rewards/thk_ans_format_reward": 1.0, "step": 1175, "think_completion_length": 40.6875 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 1.984822934232715, "grad_norm": 5.86577008258473, "kl": 0.5, "learning_rate": 6.033726812816189e-07, "loss": 0.0005, "reward": 3.3224263191223145, "reward_std": 0.11347942799329758, "rewards/final_reward": 1.2982229708882325, "rewards/mask_iou_reward": 0.6491114854441162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.322426199913025, "rewards/thk_ans_format_reward": 1.0, "step": 1176, "think_completion_length": 44.375 }, { "clip_ratio": 0.0, "completion_length": 114.15625, "epoch": 1.9865092748735245, "grad_norm": 13.214045562525113, "kl": 0.46875, "learning_rate": 6.03035413153457e-07, "loss": 0.0004, "reward": 3.273212194442749, "reward_std": 0.15405417047441006, "rewards/final_reward": 0.9980004495482484, "rewards/mask_iou_reward": 0.4990002247741242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2732122540473938, "rewards/thk_ans_format_reward": 1.0, "step": 1177, "think_completion_length": 44.625 }, { "clip_ratio": 0.0, "completion_length": 123.765625, "epoch": 1.9881956155143339, "grad_norm": 7.791891780496249, "kl": 0.50390625, "learning_rate": 6.026981450252951e-07, "loss": 0.0005, "reward": 3.2183451652526855, "reward_std": 0.33510997891426086, "rewards/final_reward": 1.4571303503080095, "rewards/mask_iou_reward": 0.7285651751540048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2183451652526855, "rewards/thk_ans_format_reward": 1.0, "step": 1178, "think_completion_length": 54.34375 }, { "clip_ratio": 0.0, "completion_length": 106.25, "epoch": 1.9898819561551433, "grad_norm": 6.02650274959712, "kl": 0.501953125, "learning_rate": 6.023608768971333e-07, "loss": 0.0005, "reward": 3.1446791887283325, "reward_std": 0.5253532081842422, "rewards/final_reward": 1.4351317064026219, "rewards/mask_iou_reward": 0.7175658532013109, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.144679307937622, "rewards/thk_ans_format_reward": 1.0, "step": 1179, "think_completion_length": 38.8125 }, { "clip_ratio": 0.0, "completion_length": 121.796875, "epoch": 1.9915682967959527, "grad_norm": 7.805565918911529, "kl": 0.4658203125, "learning_rate": 6.020236087689713e-07, "loss": 0.0005, "reward": 3.2576130628585815, "reward_std": 0.07200890593230724, "rewards/final_reward": 0.6822467371243718, "rewards/mask_iou_reward": 0.3411233685621859, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.257613182067871, "rewards/thk_ans_format_reward": 1.0, "step": 1180, "think_completion_length": 46.03125 }, { "clip_ratio": 0.0, "completion_length": 118.203125, "epoch": 1.9932546374367623, "grad_norm": 8.633385502886387, "kl": 0.4501953125, "learning_rate": 6.016863406408093e-07, "loss": 0.0004, "reward": 3.324090003967285, "reward_std": 0.198069479316473, "rewards/final_reward": 1.0850171364270256, "rewards/mask_iou_reward": 0.5425085682135128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3240899443626404, "rewards/thk_ans_format_reward": 1.0, "step": 1181, "think_completion_length": 46.90625 }, { "clip_ratio": 0.0, "completion_length": 115.765625, "epoch": 1.9949409780775715, "grad_norm": 4.1814855348800295, "kl": 0.4853515625, "learning_rate": 6.013490725126475e-07, "loss": 0.0005, "reward": 3.1244304180145264, "reward_std": 0.0062828969093970954, "rewards/final_reward": 0.4318805218001468, "rewards/mask_iou_reward": 0.2159402609000734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1244304180145264, "rewards/thk_ans_format_reward": 1.0, "step": 1182, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 115.8125, "epoch": 1.9966273187183812, "grad_norm": 12.505994574842456, "kl": 0.5107421875, "learning_rate": 6.010118043844856e-07, "loss": 0.0005, "reward": 2.967620015144348, "reward_std": 0.14106937497854233, "rewards/final_reward": 0.7828099518289345, "rewards/mask_iou_reward": 0.39140497591446727, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9676199853420258, "rewards/thk_ans_format_reward": 1.0, "step": 1183, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 118.66666793823242, "epoch": 1.9983136593591906, "grad_norm": 75.3561607357473, "kl": 0.4482421875, "learning_rate": 6.006745362563238e-07, "loss": 0.0004, "reward": 3.516904592514038, "reward_std": 0.023798184003680944, "rewards/final_reward": 1.5131991709008814, "rewards/mask_iou_reward": 0.7565995854504407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5169046521186829, "rewards/thk_ans_format_reward": 1.0, "step": 1184, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 116.625, "epoch": 2.0016863406408096, "grad_norm": 13.48689102759712, "kl": 0.49609375, "learning_rate": 6.003372681281619e-07, "loss": 0.0005, "reward": 2.9429726600646973, "reward_std": 0.23621351923793554, "rewards/final_reward": 0.49316087068991743, "rewards/mask_iou_reward": 0.24658043534495871, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9429725408554077, "rewards/thk_ans_format_reward": 1.0, "step": 1185, "think_completion_length": 48.65625 }, { "clip_ratio": 0.0, "completion_length": 111.4375, "epoch": 2.003372681281619, "grad_norm": 8.146315588376712, "kl": 0.52734375, "learning_rate": 6e-07, "loss": 0.0005, "reward": 3.3868921995162964, "reward_std": 0.21877353638410568, "rewards/final_reward": 1.0489110172147114, "rewards/mask_iou_reward": 0.5244555086073557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3868921399116516, "rewards/thk_ans_format_reward": 1.0, "step": 1186, "think_completion_length": 43.0625 }, { "clip_ratio": 0.0, "completion_length": 115.171875, "epoch": 2.0050590219224285, "grad_norm": 14.54303439688115, "kl": 0.4765625, "learning_rate": 5.996627318718382e-07, "loss": 0.0005, "reward": 3.432328224182129, "reward_std": 0.12002099305391312, "rewards/final_reward": 1.6099766606977879, "rewards/mask_iou_reward": 0.8049883303488939, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4323282837867737, "rewards/thk_ans_format_reward": 1.0, "step": 1187, "think_completion_length": 49.65625 }, { "clip_ratio": 0.0, "completion_length": 119.875, "epoch": 2.0067453625632377, "grad_norm": 5.788675497399444, "kl": 0.4755859375, "learning_rate": 5.993254637436763e-07, "loss": 0.0005, "reward": 3.4901273250579834, "reward_std": 0.198878675699234, "rewards/final_reward": 1.3744566459246905, "rewards/mask_iou_reward": 0.6872283229623453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4901273250579834, "rewards/thk_ans_format_reward": 1.0, "step": 1188, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 119.140625, "epoch": 2.0084317032040473, "grad_norm": 4.644312190596549, "kl": 0.515625, "learning_rate": 5.989881956155142e-07, "loss": 0.0005, "reward": 2.759495258331299, "reward_std": 0.20145833492279053, "rewards/final_reward": 0.5538062995088217, "rewards/mask_iou_reward": 0.2769031497544108, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7594953179359436, "rewards/thk_ans_format_reward": 1.0, "step": 1189, "think_completion_length": 46.65625 }, { "clip_ratio": 0.0, "completion_length": 115.265625, "epoch": 2.0101180438448565, "grad_norm": 7.67349455004292, "kl": 0.498046875, "learning_rate": 5.986509274873524e-07, "loss": 0.0005, "reward": 3.634737968444824, "reward_std": 0.20400644093751907, "rewards/final_reward": 1.9044805719221032, "rewards/mask_iou_reward": 0.9522402859610516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.634738028049469, "rewards/thk_ans_format_reward": 1.0, "step": 1190, "think_completion_length": 41.0 }, { "clip_ratio": 0.0, "completion_length": 115.859375, "epoch": 2.011804384485666, "grad_norm": 8.301429030110507, "kl": 0.48828125, "learning_rate": 5.983136593591905e-07, "loss": 0.0005, "reward": 3.4187628030776978, "reward_std": 0.2773596942424774, "rewards/final_reward": 1.4393433284356094, "rewards/mask_iou_reward": 0.7196716642178047, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.418762981891632, "rewards/thk_ans_format_reward": 1.0, "step": 1191, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 111.03125, "epoch": 2.0134907251264758, "grad_norm": 6.439932075838745, "kl": 0.51171875, "learning_rate": 5.979763912310286e-07, "loss": 0.0005, "reward": 3.4730443954467773, "reward_std": 0.04087753966450691, "rewards/final_reward": 1.5909392846397068, "rewards/mask_iou_reward": 0.7954696423198534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.473044514656067, "rewards/thk_ans_format_reward": 1.0, "step": 1192, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 113.90625, "epoch": 2.015177065767285, "grad_norm": 8.227312462160738, "kl": 0.4912109375, "learning_rate": 5.976391231028668e-07, "loss": 0.0005, "reward": 3.2172244787216187, "reward_std": 0.18251924961805344, "rewards/final_reward": 1.179570640819125, "rewards/mask_iou_reward": 0.5897853204095626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2172245979309082, "rewards/thk_ans_format_reward": 1.0, "step": 1193, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 116.03125, "epoch": 2.0168634064080946, "grad_norm": 8.540116550794291, "kl": 0.509765625, "learning_rate": 5.973018549747049e-07, "loss": 0.0005, "reward": 2.839731454849243, "reward_std": 0.11079123802483082, "rewards/final_reward": 1.239055328482462, "rewards/mask_iou_reward": 0.619527664241231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8397313356399536, "rewards/thk_ans_format_reward": 1.0, "step": 1194, "think_completion_length": 39.78125 }, { "clip_ratio": 0.0, "completion_length": 114.734375, "epoch": 2.0185497470489038, "grad_norm": 5.567108294211803, "kl": 0.4599609375, "learning_rate": 5.96964586846543e-07, "loss": 0.0005, "reward": 3.490329623222351, "reward_std": 0.0630667507648468, "rewards/final_reward": 1.4267507476745882, "rewards/mask_iou_reward": 0.7133753738372941, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4903295636177063, "rewards/thk_ans_format_reward": 1.0, "step": 1195, "think_completion_length": 39.21875 }, { "clip_ratio": 0.0, "completion_length": 114.015625, "epoch": 2.0202360876897134, "grad_norm": 7.514581447833132, "kl": 0.4443359375, "learning_rate": 5.966273187183812e-07, "loss": 0.0005, "reward": 3.437077522277832, "reward_std": 0.33910364657640457, "rewards/final_reward": 1.6282311242662053, "rewards/mask_iou_reward": 0.8141155621331027, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4370774626731873, "rewards/thk_ans_format_reward": 1.0, "step": 1196, "think_completion_length": 44.3125 }, { "clip_ratio": 0.0, "completion_length": 114.875, "epoch": 2.0219224283305226, "grad_norm": 9.150943576049919, "kl": 0.529296875, "learning_rate": 5.962900505902191e-07, "loss": 0.0005, "reward": 3.362897038459778, "reward_std": 0.18507951498031616, "rewards/final_reward": 1.7366566859495336, "rewards/mask_iou_reward": 0.8683283429747668, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3628970384597778, "rewards/thk_ans_format_reward": 1.0, "step": 1197, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 117.5, "epoch": 2.0236087689713322, "grad_norm": 6.336774089846376, "kl": 0.501953125, "learning_rate": 5.959527824620572e-07, "loss": 0.0005, "reward": 3.3588712215423584, "reward_std": 0.18687545135617256, "rewards/final_reward": 1.085339322650293, "rewards/mask_iou_reward": 0.5426696613251465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3588712215423584, "rewards/thk_ans_format_reward": 1.0, "step": 1198, "think_completion_length": 47.65625 }, { "clip_ratio": 0.0, "completion_length": 112.921875, "epoch": 2.0252951096121414, "grad_norm": 10.164923995181558, "kl": 0.54296875, "learning_rate": 5.956155143338954e-07, "loss": 0.0005, "reward": 2.4154281616210938, "reward_std": 0.11479150131344795, "rewards/final_reward": 0.18002585004279, "rewards/mask_iou_reward": 0.090012925021395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.41542813181877136, "rewards/thk_ans_format_reward": 1.0, "step": 1199, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 118.203125, "epoch": 2.026981450252951, "grad_norm": 6.859588749252243, "kl": 0.4638671875, "learning_rate": 5.952782462057335e-07, "loss": 0.0005, "reward": 3.202408790588379, "reward_std": 0.0297409575432539, "rewards/final_reward": 1.5953461359410652, "rewards/mask_iou_reward": 0.7976730679705326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2024087607860565, "rewards/thk_ans_format_reward": 1.0, "step": 1200, "think_completion_length": 45.09375 }, { "clip_ratio": 0.0, "completion_length": 112.0, "epoch": 2.0286677908937607, "grad_norm": 10.968961604711549, "kl": 0.5859375, "learning_rate": 5.949409780775716e-07, "loss": 0.0006, "reward": 2.9323331117630005, "reward_std": 0.06923278025351465, "rewards/final_reward": 0.7952303924692466, "rewards/mask_iou_reward": 0.3976151962346233, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9323331415653229, "rewards/thk_ans_format_reward": 1.0, "step": 1201, "think_completion_length": 40.875 }, { "clip_ratio": 0.0, "completion_length": 115.4375, "epoch": 2.03035413153457, "grad_norm": 5.190037758988362, "kl": 0.556640625, "learning_rate": 5.946037099494098e-07, "loss": 0.0005, "reward": 2.9672462940216064, "reward_std": 0.06309534143656492, "rewards/final_reward": 1.3002051463077566, "rewards/mask_iou_reward": 0.6501025731538783, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9672463536262512, "rewards/thk_ans_format_reward": 1.0, "step": 1202, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 116.46875, "epoch": 2.0320404721753795, "grad_norm": 48.15014122950417, "kl": 0.5029296875, "learning_rate": 5.942664418212479e-07, "loss": 0.0005, "reward": 3.2930922508239746, "reward_std": 0.1110190600156784, "rewards/final_reward": 1.0016735899585203, "rewards/mask_iou_reward": 0.5008367949792601, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2930924892425537, "rewards/thk_ans_format_reward": 1.0, "step": 1203, "think_completion_length": 48.65625 }, { "clip_ratio": 0.0, "completion_length": 117.59375, "epoch": 2.0337268128161887, "grad_norm": 12.588029094533338, "kl": 0.5029296875, "learning_rate": 5.93929173693086e-07, "loss": 0.0005, "reward": 3.519856095314026, "reward_std": 0.05847097374498844, "rewards/final_reward": 1.4558192890655186, "rewards/mask_iou_reward": 0.7279096445327593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5198561549186707, "rewards/thk_ans_format_reward": 1.0, "step": 1204, "think_completion_length": 47.65625 }, { "clip_ratio": 0.0, "completion_length": 113.453125, "epoch": 2.0354131534569984, "grad_norm": 3.8998339647410987, "kl": 0.55078125, "learning_rate": 5.935919055649242e-07, "loss": 0.0005, "reward": 2.5957932472229004, "reward_std": 0.07152135111391544, "rewards/final_reward": 0.6404311645514321, "rewards/mask_iou_reward": 0.32021558227571606, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5957932770252228, "rewards/thk_ans_format_reward": 1.0, "step": 1205, "think_completion_length": 49.5625 }, { "clip_ratio": 0.0, "completion_length": 117.9375, "epoch": 2.0370994940978076, "grad_norm": 9.934504424693726, "kl": 0.5078125, "learning_rate": 5.932546374367621e-07, "loss": 0.0005, "reward": 2.9279143810272217, "reward_std": 0.1831659022718668, "rewards/final_reward": 1.4839509979409318, "rewards/mask_iou_reward": 0.7419754989704659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9279144108295441, "rewards/thk_ans_format_reward": 1.0, "step": 1206, "think_completion_length": 44.15625 }, { "clip_ratio": 0.0, "completion_length": 117.625, "epoch": 2.038785834738617, "grad_norm": 6.8848677535750955, "kl": 0.513671875, "learning_rate": 5.929173693086002e-07, "loss": 0.0005, "reward": 3.8089691400527954, "reward_std": 0.10258529148995876, "rewards/final_reward": 1.7815592827991056, "rewards/mask_iou_reward": 0.8907796413995528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8089691400527954, "rewards/thk_ans_format_reward": 1.0, "step": 1207, "think_completion_length": 50.34375 }, { "clip_ratio": 0.0, "completion_length": 110.09375, "epoch": 2.040472175379427, "grad_norm": 10.670956222703316, "kl": 0.544921875, "learning_rate": 5.925801011804384e-07, "loss": 0.0005, "reward": 3.71175754070282, "reward_std": 0.09188080579042435, "rewards/final_reward": 1.552977126938897, "rewards/mask_iou_reward": 0.7764885634694485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7117576003074646, "rewards/thk_ans_format_reward": 1.0, "step": 1208, "think_completion_length": 43.125 }, { "clip_ratio": 0.0, "completion_length": 115.171875, "epoch": 2.042158516020236, "grad_norm": 5.359893518780748, "kl": 0.63671875, "learning_rate": 5.922428330522765e-07, "loss": 0.0006, "reward": 3.751399278640747, "reward_std": 0.13887044158764184, "rewards/final_reward": 1.795235393339603, "rewards/mask_iou_reward": 0.8976176966698015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.751399278640747, "rewards/thk_ans_format_reward": 1.0, "step": 1209, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 114.359375, "epoch": 2.0438448566610457, "grad_norm": 5.770311558844356, "kl": 0.533203125, "learning_rate": 5.919055649241147e-07, "loss": 0.0005, "reward": 3.644760251045227, "reward_std": 0.05879488307982683, "rewards/final_reward": 1.622706724028792, "rewards/mask_iou_reward": 0.811353362014396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6447601914405823, "rewards/thk_ans_format_reward": 1.0, "step": 1210, "think_completion_length": 47.9375 }, { "clip_ratio": 0.0, "completion_length": 112.375, "epoch": 2.045531197301855, "grad_norm": 20.427617565995828, "kl": 0.49609375, "learning_rate": 5.915682967959528e-07, "loss": 0.0005, "reward": 3.641817569732666, "reward_std": 0.05982916243374348, "rewards/final_reward": 1.7792524175183502, "rewards/mask_iou_reward": 0.8896262087591751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6418176293373108, "rewards/thk_ans_format_reward": 1.0, "step": 1211, "think_completion_length": 42.5625 }, { "clip_ratio": 0.0, "completion_length": 117.328125, "epoch": 2.0472175379426645, "grad_norm": 25.937607637845336, "kl": 0.501953125, "learning_rate": 5.912310286677909e-07, "loss": 0.0005, "reward": 3.0757253170013428, "reward_std": 0.22424892336130142, "rewards/final_reward": 1.3770070944798838, "rewards/mask_iou_reward": 0.6885035472399419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0757253766059875, "rewards/thk_ans_format_reward": 1.0, "step": 1212, "think_completion_length": 46.53125 }, { "clip_ratio": 0.0, "completion_length": 156.796875, "epoch": 2.0489038785834737, "grad_norm": 7.261303365634824, "kl": 0.451171875, "learning_rate": 5.908937605396291e-07, "loss": 0.0005, "reward": 3.3862454891204834, "reward_std": 0.09621530398726463, "rewards/final_reward": 1.1227074698332027, "rewards/mask_iou_reward": 0.5613537349166013, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3862455487251282, "rewards/thk_ans_format_reward": 1.0, "step": 1213, "think_completion_length": 48.59375 }, { "clip_ratio": 0.0, "completion_length": 119.578125, "epoch": 2.0505902192242833, "grad_norm": 6.27675477947036, "kl": 0.50390625, "learning_rate": 5.905564924114671e-07, "loss": 0.0005, "reward": 3.8144431114196777, "reward_std": 0.050515939481556416, "rewards/final_reward": 1.823427526112407, "rewards/mask_iou_reward": 0.9117137630562036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8144429326057434, "rewards/thk_ans_format_reward": 1.0, "step": 1214, "think_completion_length": 52.21875 }, { "clip_ratio": 0.0, "completion_length": 115.453125, "epoch": 2.052276559865093, "grad_norm": 15.280398429391834, "kl": 0.490234375, "learning_rate": 5.902192242833051e-07, "loss": 0.0005, "reward": 3.0774126052856445, "reward_std": 0.22712376341223717, "rewards/final_reward": 1.1759968898950153, "rewards/mask_iou_reward": 0.5879984449475076, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.077412635087967, "rewards/thk_ans_format_reward": 1.0, "step": 1215, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 112.171875, "epoch": 2.053962900505902, "grad_norm": 25.440565612456673, "kl": 0.4755859375, "learning_rate": 5.898819561551433e-07, "loss": 0.0005, "reward": 3.495232939720154, "reward_std": 0.16499481350183487, "rewards/final_reward": 1.3941976278260753, "rewards/mask_iou_reward": 0.6970988139130376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.495232880115509, "rewards/thk_ans_format_reward": 1.0, "step": 1216, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 135.734375, "epoch": 2.0556492411467118, "grad_norm": 19.52427442013993, "kl": 0.568359375, "learning_rate": 5.895446880269814e-07, "loss": 0.0006, "reward": 3.5315277576446533, "reward_std": 0.09806636191206053, "rewards/final_reward": 1.7410238492657086, "rewards/mask_iou_reward": 0.8705119246328543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5315275192260742, "rewards/thk_ans_format_reward": 1.0, "step": 1217, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 114.78125, "epoch": 2.057335581787521, "grad_norm": 7.077278333938166, "kl": 0.48828125, "learning_rate": 5.892074198988195e-07, "loss": 0.0005, "reward": 3.1986727714538574, "reward_std": 0.15416544303297997, "rewards/final_reward": 0.526140168463072, "rewards/mask_iou_reward": 0.263070084231536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1986727714538574, "rewards/thk_ans_format_reward": 1.0, "step": 1218, "think_completion_length": 45.4375 }, { "clip_ratio": 0.0, "completion_length": 119.59375, "epoch": 2.0590219224283306, "grad_norm": 12.056694496829845, "kl": 0.513671875, "learning_rate": 5.888701517706577e-07, "loss": 0.0005, "reward": 3.175121545791626, "reward_std": 0.24305326491594315, "rewards/final_reward": 1.5874991299200971, "rewards/mask_iou_reward": 0.7937495649600486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1751216650009155, "rewards/thk_ans_format_reward": 1.0, "step": 1219, "think_completion_length": 50.78125 }, { "clip_ratio": 0.0, "completion_length": 113.640625, "epoch": 2.06070826306914, "grad_norm": 8.927632556503001, "kl": 0.5234375, "learning_rate": 5.885328836424958e-07, "loss": 0.0005, "reward": 3.10360050201416, "reward_std": 0.3249269500374794, "rewards/final_reward": 0.7365527669777219, "rewards/mask_iou_reward": 0.36827638348886094, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1036004424095154, "rewards/thk_ans_format_reward": 1.0, "step": 1220, "think_completion_length": 39.03125 }, { "clip_ratio": 0.0, "completion_length": 111.234375, "epoch": 2.0623946037099494, "grad_norm": 6.797062459675585, "kl": 0.482421875, "learning_rate": 5.881956155143339e-07, "loss": 0.0005, "reward": 2.975393295288086, "reward_std": 0.085931153036654, "rewards/final_reward": 1.4435219891230973, "rewards/mask_iou_reward": 0.7217609945615486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9753932952880859, "rewards/thk_ans_format_reward": 1.0, "step": 1221, "think_completion_length": 42.15625 }, { "clip_ratio": 0.0, "completion_length": 117.265625, "epoch": 2.064080944350759, "grad_norm": 25.28742553611422, "kl": 0.505859375, "learning_rate": 5.87858347386172e-07, "loss": 0.0005, "reward": 3.0978381633758545, "reward_std": 0.2666756585240364, "rewards/final_reward": 1.2287549133668487, "rewards/mask_iou_reward": 0.6143774566834244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0978381633758545, "rewards/thk_ans_format_reward": 1.0, "step": 1222, "think_completion_length": 48.21875 }, { "clip_ratio": 0.0, "completion_length": 116.71875, "epoch": 2.0657672849915683, "grad_norm": 9.439762606777863, "kl": 0.52734375, "learning_rate": 5.875210792580101e-07, "loss": 0.0005, "reward": 3.324112296104431, "reward_std": 0.16529548168182373, "rewards/final_reward": 1.195829944225152, "rewards/mask_iou_reward": 0.597914972112576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.324112355709076, "rewards/thk_ans_format_reward": 1.0, "step": 1223, "think_completion_length": 45.53125 }, { "clip_ratio": 0.0, "completion_length": 120.921875, "epoch": 2.067453625632378, "grad_norm": 11.069853275445752, "kl": 0.546875, "learning_rate": 5.871838111298481e-07, "loss": 0.0005, "reward": 3.2145529985427856, "reward_std": 0.12553077191114426, "rewards/final_reward": 1.1087564448687577, "rewards/mask_iou_reward": 0.5543782224343788, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.214552879333496, "rewards/thk_ans_format_reward": 1.0, "step": 1224, "think_completion_length": 50.5625 }, { "clip_ratio": 0.0, "completion_length": 115.6875, "epoch": 2.069139966273187, "grad_norm": 6.586658803173453, "kl": 0.5498046875, "learning_rate": 5.868465430016863e-07, "loss": 0.0006, "reward": 3.655348539352417, "reward_std": 0.14335137605667114, "rewards/final_reward": 1.6976510726235734, "rewards/mask_iou_reward": 0.8488255363117867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.655348539352417, "rewards/thk_ans_format_reward": 1.0, "step": 1225, "think_completion_length": 43.75 }, { "clip_ratio": 0.0, "completion_length": 120.171875, "epoch": 2.0708263069139967, "grad_norm": 7.341675970877156, "kl": 0.509765625, "learning_rate": 5.865092748735244e-07, "loss": 0.0005, "reward": 3.4260720014572144, "reward_std": 0.38464102149009705, "rewards/final_reward": 1.2654199398354669, "rewards/mask_iou_reward": 0.6327099699177334, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4416970610618591, "rewards/thk_ans_format_reward": 1.0, "step": 1226, "think_completion_length": 48.71875 }, { "clip_ratio": 0.0, "completion_length": 117.03125, "epoch": 2.072512647554806, "grad_norm": 5.922152295385602, "kl": 0.572265625, "learning_rate": 5.861720067453625e-07, "loss": 0.0006, "reward": 3.173344850540161, "reward_std": 0.0615835078060627, "rewards/final_reward": 1.1209231989245945, "rewards/mask_iou_reward": 0.5604615994622972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1733447909355164, "rewards/thk_ans_format_reward": 1.0, "step": 1227, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 120.953125, "epoch": 2.0741989881956155, "grad_norm": 7.530184997434515, "kl": 0.486328125, "learning_rate": 5.858347386172007e-07, "loss": 0.0005, "reward": 3.558689832687378, "reward_std": 0.06893946789205074, "rewards/final_reward": 1.4888971286790864, "rewards/mask_iou_reward": 0.7444485643395432, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5586897134780884, "rewards/thk_ans_format_reward": 1.0, "step": 1228, "think_completion_length": 46.21875 }, { "clip_ratio": 0.0, "completion_length": 115.078125, "epoch": 2.075885328836425, "grad_norm": 7.809389484619535, "kl": 0.505859375, "learning_rate": 5.854974704890388e-07, "loss": 0.0005, "reward": 2.6829360723495483, "reward_std": 0.31088511645793915, "rewards/final_reward": 0.8371374268188949, "rewards/mask_iou_reward": 0.41856871340944746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6829361021518707, "rewards/thk_ans_format_reward": 1.0, "step": 1229, "think_completion_length": 44.4375 }, { "clip_ratio": 0.0, "completion_length": 114.046875, "epoch": 2.0775716694772344, "grad_norm": 5.389053395617888, "kl": 0.54296875, "learning_rate": 5.851602023608768e-07, "loss": 0.0005, "reward": 3.397468686103821, "reward_std": 0.11730508506298065, "rewards/final_reward": 1.1659759993446794, "rewards/mask_iou_reward": 0.5829879996723397, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3974686861038208, "rewards/thk_ans_format_reward": 1.0, "step": 1230, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 118.890625, "epoch": 2.079258010118044, "grad_norm": 18.292579630418203, "kl": 0.515625, "learning_rate": 5.84822934232715e-07, "loss": 0.0005, "reward": 3.2734872102737427, "reward_std": 0.16774200648069382, "rewards/final_reward": 1.4035551535338837, "rewards/mask_iou_reward": 0.7017775767669419, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2734872102737427, "rewards/thk_ans_format_reward": 1.0, "step": 1231, "think_completion_length": 53.71875 }, { "clip_ratio": 0.0, "completion_length": 123.515625, "epoch": 2.080944350758853, "grad_norm": 9.232430436849269, "kl": 0.50390625, "learning_rate": 5.84485666104553e-07, "loss": 0.0005, "reward": 3.0458027124404907, "reward_std": 0.21156837791204453, "rewards/final_reward": 1.0424462781126267, "rewards/mask_iou_reward": 0.5212231390563133, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0458028018474579, "rewards/thk_ans_format_reward": 1.0, "step": 1232, "think_completion_length": 49.59375 }, { "clip_ratio": 0.0, "completion_length": 114.1875, "epoch": 2.082630691399663, "grad_norm": 11.183935733331714, "kl": 0.4892578125, "learning_rate": 5.841483979763911e-07, "loss": 0.0005, "reward": 3.0968226194381714, "reward_std": 0.33744121342897415, "rewards/final_reward": 1.1910839599885503, "rewards/mask_iou_reward": 0.5955419799942752, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0968225598335266, "rewards/thk_ans_format_reward": 1.0, "step": 1233, "think_completion_length": 52.25 }, { "clip_ratio": 0.0, "completion_length": 119.015625, "epoch": 2.084317032040472, "grad_norm": 11.91292635625383, "kl": 0.5234375, "learning_rate": 5.838111298482293e-07, "loss": 0.0005, "reward": 3.551935076713562, "reward_std": 0.10081242024898529, "rewards/final_reward": 1.4946258071000098, "rewards/mask_iou_reward": 0.7473129035500049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.551935076713562, "rewards/thk_ans_format_reward": 1.0, "step": 1234, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 111.609375, "epoch": 2.0860033726812817, "grad_norm": 7.440830997494622, "kl": 0.546875, "learning_rate": 5.834738617200674e-07, "loss": 0.0006, "reward": 3.7355228662490845, "reward_std": 0.10745073819998652, "rewards/final_reward": 1.9838178236248292, "rewards/mask_iou_reward": 0.9919089118124146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7355228066444397, "rewards/thk_ans_format_reward": 1.0, "step": 1235, "think_completion_length": 50.46875 }, { "clip_ratio": 0.0, "completion_length": 120.078125, "epoch": 2.087689713322091, "grad_norm": 7.373616608509708, "kl": 0.484375, "learning_rate": 5.831365935919056e-07, "loss": 0.0005, "reward": 3.488574981689453, "reward_std": 0.03398977406322956, "rewards/final_reward": 1.6369875219333916, "rewards/mask_iou_reward": 0.8184937609666958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4885749220848083, "rewards/thk_ans_format_reward": 1.0, "step": 1236, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 121.8125, "epoch": 2.0893760539629005, "grad_norm": 9.540775863697288, "kl": 0.6328125, "learning_rate": 5.827993254637437e-07, "loss": 0.0006, "reward": 3.630674719810486, "reward_std": 0.3000973165035248, "rewards/final_reward": 1.5710772669419548, "rewards/mask_iou_reward": 0.7855386334709774, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6306747198104858, "rewards/thk_ans_format_reward": 1.0, "step": 1237, "think_completion_length": 49.375 }, { "clip_ratio": 0.0, "completion_length": 122.53125, "epoch": 2.09106239460371, "grad_norm": 10.590781360485371, "kl": 0.6044921875, "learning_rate": 5.824620573355818e-07, "loss": 0.0006, "reward": 3.2602202892303467, "reward_std": 0.03827218525111675, "rewards/final_reward": 1.7720339794504287, "rewards/mask_iou_reward": 0.8860169897252144, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.260220319032669, "rewards/thk_ans_format_reward": 1.0, "step": 1238, "think_completion_length": 49.28125 }, { "clip_ratio": 0.0, "completion_length": 125.703125, "epoch": 2.0927487352445193, "grad_norm": 16.82641766082157, "kl": 0.5390625, "learning_rate": 5.821247892074199e-07, "loss": 0.0005, "reward": 3.7846381664276123, "reward_std": 0.09372981078922749, "rewards/final_reward": 1.7675882258712055, "rewards/mask_iou_reward": 0.8837941129356027, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7846381068229675, "rewards/thk_ans_format_reward": 1.0, "step": 1239, "think_completion_length": 56.09375 }, { "clip_ratio": 0.0, "completion_length": 112.328125, "epoch": 2.094435075885329, "grad_norm": 8.521619117103281, "kl": 0.533203125, "learning_rate": 5.81787521079258e-07, "loss": 0.0005, "reward": 3.1291738748550415, "reward_std": 0.06243935413658619, "rewards/final_reward": 0.5839306049119234, "rewards/mask_iou_reward": 0.2919653024559617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1291736364364624, "rewards/thk_ans_format_reward": 1.0, "step": 1240, "think_completion_length": 44.03125 }, { "clip_ratio": 0.0, "completion_length": 120.875, "epoch": 2.096121416526138, "grad_norm": 3.4547751255274695, "kl": 0.51953125, "learning_rate": 5.81450252951096e-07, "loss": 0.0005, "reward": 3.304155707359314, "reward_std": 0.009104110868065618, "rewards/final_reward": 1.6302786885188945, "rewards/mask_iou_reward": 0.8151393442594472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3041555881500244, "rewards/thk_ans_format_reward": 1.0, "step": 1241, "think_completion_length": 54.375 }, { "clip_ratio": 0.0, "completion_length": 137.375, "epoch": 2.097807757166948, "grad_norm": 12.224812085536941, "kl": 0.48046875, "learning_rate": 5.811129848229342e-07, "loss": 0.0005, "reward": 3.108389377593994, "reward_std": 0.2590280845761299, "rewards/final_reward": 1.6332673514867977, "rewards/mask_iou_reward": 0.8166336757433988, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1083892583847046, "rewards/thk_ans_format_reward": 1.0, "step": 1242, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 121.140625, "epoch": 2.099494097807757, "grad_norm": 12.419841778007031, "kl": 0.462890625, "learning_rate": 5.807757166947723e-07, "loss": 0.0005, "reward": 3.318019986152649, "reward_std": 0.0800204686820507, "rewards/final_reward": 1.2042707380623143, "rewards/mask_iou_reward": 0.6021353690311572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3180198669433594, "rewards/thk_ans_format_reward": 1.0, "step": 1243, "think_completion_length": 50.875 }, { "clip_ratio": 0.0, "completion_length": 115.75, "epoch": 2.1011804384485666, "grad_norm": 7.57509628016737, "kl": 0.556640625, "learning_rate": 5.804384485666104e-07, "loss": 0.0006, "reward": 3.264046311378479, "reward_std": 0.0792916975915432, "rewards/final_reward": 1.637747794359674, "rewards/mask_iou_reward": 0.818873897179837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2640464305877686, "rewards/thk_ans_format_reward": 1.0, "step": 1244, "think_completion_length": 44.90625 }, { "clip_ratio": 0.0, "completion_length": 131.625, "epoch": 2.1028667790893762, "grad_norm": 46.38176561705908, "kl": 0.4931640625, "learning_rate": 5.801011804384486e-07, "loss": 0.0005, "reward": 2.916468858718872, "reward_std": 0.31557588279247284, "rewards/final_reward": 1.3054415203274055, "rewards/mask_iou_reward": 0.6527207601637027, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9164688587188721, "rewards/thk_ans_format_reward": 1.0, "step": 1245, "think_completion_length": 44.6875 }, { "clip_ratio": 0.0, "completion_length": 118.171875, "epoch": 2.1045531197301854, "grad_norm": 21.665840127244977, "kl": 0.5, "learning_rate": 5.797639123102867e-07, "loss": 0.0005, "reward": 3.4545528888702393, "reward_std": 0.17094121873378754, "rewards/final_reward": 1.7188712109586382, "rewards/mask_iou_reward": 0.8594356054793191, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4545528292655945, "rewards/thk_ans_format_reward": 1.0, "step": 1246, "think_completion_length": 45.90625 }, { "clip_ratio": 0.0, "completion_length": 136.578125, "epoch": 2.106239460370995, "grad_norm": 32.64713621094739, "kl": 0.4765625, "learning_rate": 5.794266441821247e-07, "loss": 0.0005, "reward": 3.6517735719680786, "reward_std": 0.10199225321412086, "rewards/final_reward": 1.5091516226489148, "rewards/mask_iou_reward": 0.7545758113244574, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6517733931541443, "rewards/thk_ans_format_reward": 1.0, "step": 1247, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 119.484375, "epoch": 2.1079258010118043, "grad_norm": 7.1967143913972, "kl": 0.48046875, "learning_rate": 5.790893760539629e-07, "loss": 0.0005, "reward": 3.353150963783264, "reward_std": 0.11774658411741257, "rewards/final_reward": 1.748703370024331, "rewards/mask_iou_reward": 0.8743516850121655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3531509041786194, "rewards/thk_ans_format_reward": 1.0, "step": 1248, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 122.78125, "epoch": 2.109612141652614, "grad_norm": 55.73109806211543, "kl": 0.509765625, "learning_rate": 5.78752107925801e-07, "loss": 0.0005, "reward": 3.6747478246688843, "reward_std": 0.10688724555075169, "rewards/final_reward": 1.561100560613465, "rewards/mask_iou_reward": 0.7805502803067325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6747477650642395, "rewards/thk_ans_format_reward": 1.0, "step": 1249, "think_completion_length": 43.9375 }, { "clip_ratio": 0.0, "completion_length": 121.78125, "epoch": 2.111298482293423, "grad_norm": 10.108456421636822, "kl": 0.51171875, "learning_rate": 5.78414839797639e-07, "loss": 0.0005, "reward": 3.0707567930221558, "reward_std": 0.19155866652727127, "rewards/final_reward": 0.6565987376431796, "rewards/mask_iou_reward": 0.3282993688215898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0707568526268005, "rewards/thk_ans_format_reward": 1.0, "step": 1250, "think_completion_length": 53.8125 }, { "clip_ratio": 0.0, "completion_length": 124.09375, "epoch": 2.1129848229342327, "grad_norm": 13.009707743391786, "kl": 0.484375, "learning_rate": 5.780775716694772e-07, "loss": 0.0005, "reward": 3.369237780570984, "reward_std": 0.17601485550403595, "rewards/final_reward": 1.2133884638153491, "rewards/mask_iou_reward": 0.6066942319076746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3692377805709839, "rewards/thk_ans_format_reward": 1.0, "step": 1251, "think_completion_length": 46.90625 }, { "clip_ratio": 0.0, "completion_length": 109.046875, "epoch": 2.1146711635750424, "grad_norm": 6.729008572861788, "kl": 0.4697265625, "learning_rate": 5.777403035413153e-07, "loss": 0.0005, "reward": 3.363896131515503, "reward_std": 0.3527638018131256, "rewards/final_reward": 1.2416063426888746, "rewards/mask_iou_reward": 0.6208031713444373, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3638960123062134, "rewards/thk_ans_format_reward": 1.0, "step": 1252, "think_completion_length": 49.9375 }, { "clip_ratio": 0.0, "completion_length": 121.40625, "epoch": 2.1163575042158516, "grad_norm": 782078.1110492643, "kl": 235520.27734375, "learning_rate": 5.774030354131534e-07, "loss": 236.1547, "reward": 3.241525888442993, "reward_std": 0.1571502909064293, "rewards/final_reward": 1.521762280870485, "rewards/mask_iou_reward": 0.7608811404352425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2571508586406708, "rewards/thk_ans_format_reward": 0.984375, "step": 1253, "think_completion_length": 50.125 }, { "clip_ratio": 0.0, "completion_length": 126.625, "epoch": 2.118043844856661, "grad_norm": 8.010863739990121, "kl": 0.5009765625, "learning_rate": 5.770657672849916e-07, "loss": 0.0005, "reward": 2.98062264919281, "reward_std": 0.07590826600790024, "rewards/final_reward": 0.8288698686168807, "rewards/mask_iou_reward": 0.41443493430844036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9806226491928101, "rewards/thk_ans_format_reward": 1.0, "step": 1254, "think_completion_length": 56.4375 }, { "clip_ratio": 0.0, "completion_length": 116.484375, "epoch": 2.1197301854974704, "grad_norm": 17.237015051028266, "kl": 0.5048828125, "learning_rate": 5.767284991568296e-07, "loss": 0.0005, "reward": 3.5511358976364136, "reward_std": 0.06634262204170227, "rewards/final_reward": 1.6558075137380417, "rewards/mask_iou_reward": 0.8279037568690208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.551135778427124, "rewards/thk_ans_format_reward": 1.0, "step": 1255, "think_completion_length": 45.78125 }, { "clip_ratio": 0.0, "completion_length": 118.421875, "epoch": 2.12141652613828, "grad_norm": 22.379463074628816, "kl": 0.4677734375, "learning_rate": 5.763912310286677e-07, "loss": 0.0005, "reward": 3.63353431224823, "reward_std": 0.030599688179790974, "rewards/final_reward": 1.3697178234861853, "rewards/mask_iou_reward": 0.6848589117430927, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.63353431224823, "rewards/thk_ans_format_reward": 1.0, "step": 1256, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 110.546875, "epoch": 2.123102866779089, "grad_norm": 8.866637777736178, "kl": 0.51171875, "learning_rate": 5.760539629005059e-07, "loss": 0.0005, "reward": 3.4119359254837036, "reward_std": 0.39036141335964203, "rewards/final_reward": 1.3524345485826261, "rewards/mask_iou_reward": 0.6762172742913131, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4119358658790588, "rewards/thk_ans_format_reward": 1.0, "step": 1257, "think_completion_length": 41.40625 }, { "clip_ratio": 0.0, "completion_length": 120.75, "epoch": 2.124789207419899, "grad_norm": 7.424783872127569, "kl": 0.490234375, "learning_rate": 5.75716694772344e-07, "loss": 0.0005, "reward": 3.281718134880066, "reward_std": 0.17376143485307693, "rewards/final_reward": 1.2578834852308733, "rewards/mask_iou_reward": 0.6289417426154367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2817181050777435, "rewards/thk_ans_format_reward": 1.0, "step": 1258, "think_completion_length": 52.03125 }, { "clip_ratio": 0.0, "completion_length": 114.046875, "epoch": 2.126475548060708, "grad_norm": 8.587208587499049, "kl": 0.533203125, "learning_rate": 5.753794266441822e-07, "loss": 0.0006, "reward": 3.0733892917633057, "reward_std": 0.18975719437003136, "rewards/final_reward": 0.683115645927564, "rewards/mask_iou_reward": 0.341557822963782, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0733892023563385, "rewards/thk_ans_format_reward": 1.0, "step": 1259, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 112.734375, "epoch": 2.1281618887015177, "grad_norm": 8.018608509495962, "kl": 0.474609375, "learning_rate": 5.750421585160202e-07, "loss": 0.0005, "reward": 3.535109758377075, "reward_std": 0.08410511817783117, "rewards/final_reward": 1.7809249511905496, "rewards/mask_iou_reward": 0.8904624755952748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5351097583770752, "rewards/thk_ans_format_reward": 1.0, "step": 1260, "think_completion_length": 39.9375 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 2.1298482293423273, "grad_norm": 9.534473471390374, "kl": 0.52734375, "learning_rate": 5.747048903878583e-07, "loss": 0.0006, "reward": 3.028801918029785, "reward_std": 0.12135305255651474, "rewards/final_reward": 1.189611819827427, "rewards/mask_iou_reward": 0.5948059099137135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.028801828622818, "rewards/thk_ans_format_reward": 1.0, "step": 1261, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 118.203125, "epoch": 2.1315345699831365, "grad_norm": 6.395713232763334, "kl": 0.509765625, "learning_rate": 5.743676222596965e-07, "loss": 0.0005, "reward": 3.2245901823043823, "reward_std": 0.10203266330063343, "rewards/final_reward": 1.3326993714609847, "rewards/mask_iou_reward": 0.6663496857304924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.224590003490448, "rewards/thk_ans_format_reward": 1.0, "step": 1262, "think_completion_length": 44.34375 }, { "clip_ratio": 0.0, "completion_length": 117.40625, "epoch": 2.133220910623946, "grad_norm": 9.683686130868914, "kl": 1.009765625, "learning_rate": 5.740303541315346e-07, "loss": 0.001, "reward": 2.9035420417785645, "reward_std": 0.2598446160554886, "rewards/final_reward": 0.7311843953140473, "rewards/mask_iou_reward": 0.36559219765702367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9035422205924988, "rewards/thk_ans_format_reward": 1.0, "step": 1263, "think_completion_length": 48.40625 }, { "clip_ratio": 0.0, "completion_length": 133.234375, "epoch": 2.1349072512647553, "grad_norm": 16.42792138501997, "kl": 0.4375, "learning_rate": 5.736930860033726e-07, "loss": 0.0004, "reward": 3.252814292907715, "reward_std": 0.07177349179983139, "rewards/final_reward": 1.13756073486116, "rewards/mask_iou_reward": 0.56878036743058, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2528142929077148, "rewards/thk_ans_format_reward": 1.0, "step": 1264, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 118.09375, "epoch": 2.136593591905565, "grad_norm": 16.85648174584554, "kl": 0.517578125, "learning_rate": 5.733558178752108e-07, "loss": 0.0005, "reward": 3.448030710220337, "reward_std": 0.1371849738061428, "rewards/final_reward": 1.4135558012866825, "rewards/mask_iou_reward": 0.7067779006433412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4480305314064026, "rewards/thk_ans_format_reward": 1.0, "step": 1265, "think_completion_length": 45.21875 }, { "clip_ratio": 0.0, "completion_length": 116.09375, "epoch": 2.138279932546374, "grad_norm": 6.680793787636658, "kl": 0.564453125, "learning_rate": 5.730185497470489e-07, "loss": 0.0004, "reward": 3.5273772478103638, "reward_std": 0.011883015278726816, "rewards/final_reward": 1.1562473751345796, "rewards/mask_iou_reward": 0.5781236875672898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5273773074150085, "rewards/thk_ans_format_reward": 1.0, "step": 1266, "think_completion_length": 48.75 }, { "clip_ratio": 0.0, "completion_length": 114.515625, "epoch": 2.139966273187184, "grad_norm": 8.837263046519144, "kl": 0.5380859375, "learning_rate": 5.72681281618887e-07, "loss": 0.0005, "reward": 3.2997041940689087, "reward_std": 0.49166443943977356, "rewards/final_reward": 1.1886923108384135, "rewards/mask_iou_reward": 0.5943461554192068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2997040748596191, "rewards/thk_ans_format_reward": 1.0, "step": 1267, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 113.40625, "epoch": 2.1416526138279934, "grad_norm": 9.282787614222825, "kl": 0.505859375, "learning_rate": 5.723440134907252e-07, "loss": 0.0005, "reward": 3.345510482788086, "reward_std": 0.002183900447562337, "rewards/final_reward": 0.9688242810184894, "rewards/mask_iou_reward": 0.4844121405092447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3455105423927307, "rewards/thk_ans_format_reward": 1.0, "step": 1268, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 113.84375, "epoch": 2.1433389544688026, "grad_norm": 9.295865669291606, "kl": 0.501953125, "learning_rate": 5.720067453625632e-07, "loss": 0.0005, "reward": 3.7576842308044434, "reward_std": 0.04031405784189701, "rewards/final_reward": 1.6845229352375926, "rewards/mask_iou_reward": 0.8422614676187963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7576842308044434, "rewards/thk_ans_format_reward": 1.0, "step": 1269, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 115.15625, "epoch": 2.1450252951096123, "grad_norm": 11.174241717616878, "kl": 0.484375, "learning_rate": 5.716694772344013e-07, "loss": 0.0004, "reward": 3.5124897956848145, "reward_std": 0.03136880323290825, "rewards/final_reward": 1.4931899922318204, "rewards/mask_iou_reward": 0.7465949961159102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5124897956848145, "rewards/thk_ans_format_reward": 1.0, "step": 1270, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 116.5625, "epoch": 2.1467116357504215, "grad_norm": 15.755950032963167, "kl": 0.484375, "learning_rate": 5.713322091062395e-07, "loss": 0.0005, "reward": 3.5956294536590576, "reward_std": 0.09438778925687075, "rewards/final_reward": 1.4855681345253289, "rewards/mask_iou_reward": 0.7427840672626644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5956295132637024, "rewards/thk_ans_format_reward": 1.0, "step": 1271, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 114.390625, "epoch": 2.148397976391231, "grad_norm": 5.144881924672007, "kl": 0.52734375, "learning_rate": 5.709949409780775e-07, "loss": 0.0005, "reward": 3.665688157081604, "reward_std": 0.10815348476171494, "rewards/final_reward": 1.5873789454257476, "rewards/mask_iou_reward": 0.7936894727128738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6656880974769592, "rewards/thk_ans_format_reward": 1.0, "step": 1272, "think_completion_length": 44.09375 }, { "clip_ratio": 0.0, "completion_length": 109.90625, "epoch": 2.1500843170320403, "grad_norm": 30.158297132887448, "kl": 0.53125, "learning_rate": 5.706576728499156e-07, "loss": 0.0005, "reward": 3.2382161617279053, "reward_std": 0.06967388093471527, "rewards/final_reward": 1.1723690364705708, "rewards/mask_iou_reward": 0.5861845182352854, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2382160425186157, "rewards/thk_ans_format_reward": 1.0, "step": 1273, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 109.84375, "epoch": 2.15177065767285, "grad_norm": 7.935702624105307, "kl": 0.509765625, "learning_rate": 5.703204047217538e-07, "loss": 0.0005, "reward": 3.0919833183288574, "reward_std": 0.049351561814546585, "rewards/final_reward": 1.068108735833558, "rewards/mask_iou_reward": 0.534054367916779, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0919832587242126, "rewards/thk_ans_format_reward": 1.0, "step": 1274, "think_completion_length": 38.0625 }, { "clip_ratio": 0.0, "completion_length": 112.40625, "epoch": 2.1534569983136596, "grad_norm": 14.373076999773213, "kl": 0.4970703125, "learning_rate": 5.699831365935919e-07, "loss": 0.0005, "reward": 3.308625102043152, "reward_std": 0.17064137011766434, "rewards/final_reward": 0.8674972316005486, "rewards/mask_iou_reward": 0.4337486158002743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3086249232292175, "rewards/thk_ans_format_reward": 1.0, "step": 1275, "think_completion_length": 41.625 }, { "clip_ratio": 0.0, "completion_length": 115.890625, "epoch": 2.1551433389544687, "grad_norm": 6.085309378106807, "kl": 0.474609375, "learning_rate": 5.6964586846543e-07, "loss": 0.0005, "reward": 3.5639824867248535, "reward_std": 0.03305263817310333, "rewards/final_reward": 1.417472939633353, "rewards/mask_iou_reward": 0.7087364698166765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.563982605934143, "rewards/thk_ans_format_reward": 1.0, "step": 1276, "think_completion_length": 43.34375 }, { "clip_ratio": 0.0, "completion_length": 112.25, "epoch": 2.1568296795952784, "grad_norm": 17.903092457178236, "kl": 0.52734375, "learning_rate": 5.693086003372681e-07, "loss": 0.0005, "reward": 3.7228747606277466, "reward_std": 0.13351622968912125, "rewards/final_reward": 1.6980407653162277, "rewards/mask_iou_reward": 0.8490203826581139, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7228747606277466, "rewards/thk_ans_format_reward": 1.0, "step": 1277, "think_completion_length": 42.75 }, { "clip_ratio": 0.0, "completion_length": 118.640625, "epoch": 2.1585160202360876, "grad_norm": 9.2663164314058, "kl": 0.501953125, "learning_rate": 5.689713322091062e-07, "loss": 0.0005, "reward": 3.1014504432678223, "reward_std": 0.09565165266394615, "rewards/final_reward": 1.5953974965658613, "rewards/mask_iou_reward": 0.7976987482829306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1014504432678223, "rewards/thk_ans_format_reward": 1.0, "step": 1278, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 111.515625, "epoch": 2.160202360876897, "grad_norm": 5.838692379118382, "kl": 0.529296875, "learning_rate": 5.686340640809443e-07, "loss": 0.0005, "reward": 3.6012667417526245, "reward_std": 0.011416994035243988, "rewards/final_reward": 1.5336660571261627, "rewards/mask_iou_reward": 0.7668330285630813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6012668013572693, "rewards/thk_ans_format_reward": 1.0, "step": 1279, "think_completion_length": 41.625 }, { "clip_ratio": 0.0, "completion_length": 110.875, "epoch": 2.1618887015177064, "grad_norm": 17.41323672321895, "kl": 0.513671875, "learning_rate": 5.682967959527824e-07, "loss": 0.0005, "reward": 3.361006498336792, "reward_std": 0.034786591306328773, "rewards/final_reward": 1.9065231356008847, "rewards/mask_iou_reward": 0.9532615678004424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3610064387321472, "rewards/thk_ans_format_reward": 1.0, "step": 1280, "think_completion_length": 40.65625 }, { "clip_ratio": 0.0, "completion_length": 111.703125, "epoch": 2.163575042158516, "grad_norm": 5.0781830994598085, "kl": 0.5234375, "learning_rate": 5.679595278246205e-07, "loss": 0.0005, "reward": 3.4662747383117676, "reward_std": 0.2071358636021614, "rewards/final_reward": 1.8022344144039115, "rewards/mask_iou_reward": 0.9011172072019558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4662747383117676, "rewards/thk_ans_format_reward": 1.0, "step": 1281, "think_completion_length": 40.71875 }, { "clip_ratio": 0.0, "completion_length": 111.46875, "epoch": 2.1652613827993257, "grad_norm": 5.313808569171344, "kl": 0.51953125, "learning_rate": 5.676222596964586e-07, "loss": 0.0005, "reward": 3.129016160964966, "reward_std": 0.12660411186516285, "rewards/final_reward": 1.2375794669776226, "rewards/mask_iou_reward": 0.6187897334888113, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1290162205696106, "rewards/thk_ans_format_reward": 1.0, "step": 1282, "think_completion_length": 39.28125 }, { "clip_ratio": 0.0, "completion_length": 134.0625, "epoch": 2.166947723440135, "grad_norm": 8.237313862885776, "kl": 0.4453125, "learning_rate": 5.672849915682968e-07, "loss": 0.0004, "reward": 3.0471763610839844, "reward_std": 0.3178776204586029, "rewards/final_reward": 0.6561711341302512, "rewards/mask_iou_reward": 0.3280855670651256, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0471762418746948, "rewards/thk_ans_format_reward": 1.0, "step": 1283, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 142.78125, "epoch": 2.1686340640809445, "grad_norm": 5.8652658890448865, "kl": 0.4482421875, "learning_rate": 5.669477234401349e-07, "loss": 0.0005, "reward": 3.6685824394226074, "reward_std": 0.06685462407767773, "rewards/final_reward": 1.8848773448353418, "rewards/mask_iou_reward": 0.9424386724176709, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6685824990272522, "rewards/thk_ans_format_reward": 1.0, "step": 1284, "think_completion_length": 36.84375 }, { "clip_ratio": 0.0, "completion_length": 109.0625, "epoch": 2.1703204047217537, "grad_norm": 5.837632353873461, "kl": 0.5546875, "learning_rate": 5.666104553119731e-07, "loss": 0.0005, "reward": 3.229220151901245, "reward_std": 0.02632999565685168, "rewards/final_reward": 1.788056331911195, "rewards/mask_iou_reward": 0.8940281659555975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2292202413082123, "rewards/thk_ans_format_reward": 1.0, "step": 1285, "think_completion_length": 38.59375 }, { "clip_ratio": 0.0, "completion_length": 141.015625, "epoch": 2.1720067453625633, "grad_norm": 4.7995000021626115, "kl": 0.49609375, "learning_rate": 5.662731871838111e-07, "loss": 0.0005, "reward": 3.326902389526367, "reward_std": 0.07282107695937157, "rewards/final_reward": 0.9955913918744543, "rewards/mask_iou_reward": 0.49779569593722717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3269023895263672, "rewards/thk_ans_format_reward": 1.0, "step": 1286, "think_completion_length": 36.34375 }, { "clip_ratio": 0.0, "completion_length": 116.890625, "epoch": 2.1736930860033725, "grad_norm": 10.813729699788505, "kl": 0.53515625, "learning_rate": 5.659359190556492e-07, "loss": 0.0005, "reward": 3.4771808385849, "reward_std": 0.19824712723493576, "rewards/final_reward": 1.4848713419331876, "rewards/mask_iou_reward": 0.7424356709665938, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4771807789802551, "rewards/thk_ans_format_reward": 1.0, "step": 1287, "think_completion_length": 37.125 }, { "clip_ratio": 0.0, "completion_length": 123.0625, "epoch": 2.175379426644182, "grad_norm": 11.319448877957019, "kl": 0.513671875, "learning_rate": 5.655986509274874e-07, "loss": 0.0005, "reward": 2.9048445224761963, "reward_std": 0.06361216679215431, "rewards/final_reward": 0.8371745185363462, "rewards/mask_iou_reward": 0.4185872592681731, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9048446416854858, "rewards/thk_ans_format_reward": 1.0, "step": 1288, "think_completion_length": 41.125 }, { "clip_ratio": 0.0, "completion_length": 110.671875, "epoch": 2.177065767284992, "grad_norm": 12.218867399167939, "kl": 0.5546875, "learning_rate": 5.652613827993254e-07, "loss": 0.0006, "reward": 3.2906709909439087, "reward_std": 0.03546382300555706, "rewards/final_reward": 0.9750635697684437, "rewards/mask_iou_reward": 0.48753178488422183, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2906709909439087, "rewards/thk_ans_format_reward": 1.0, "step": 1289, "think_completion_length": 31.84375 }, { "clip_ratio": 0.0, "completion_length": 125.875, "epoch": 2.178752107925801, "grad_norm": 12.713163442429922, "kl": 0.46484375, "learning_rate": 5.649241146711635e-07, "loss": 0.0005, "reward": 3.008612871170044, "reward_std": 0.2824648320674896, "rewards/final_reward": 0.9454986306429427, "rewards/mask_iou_reward": 0.47274931532147135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.008612722158432, "rewards/thk_ans_format_reward": 1.0, "step": 1290, "think_completion_length": 40.96875 }, { "clip_ratio": 0.0, "completion_length": 109.03125, "epoch": 2.1804384485666106, "grad_norm": 26.08829645776549, "kl": 0.51171875, "learning_rate": 5.645868465430017e-07, "loss": 0.0005, "reward": 3.0946640968322754, "reward_std": 0.3881853371858597, "rewards/final_reward": 1.1850956641725252, "rewards/mask_iou_reward": 0.5925478320862626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.094664067029953, "rewards/thk_ans_format_reward": 1.0, "step": 1291, "think_completion_length": 39.5625 }, { "clip_ratio": 0.0, "completion_length": 112.34375, "epoch": 2.18212478920742, "grad_norm": 8.58694853252235, "kl": 0.5859375, "learning_rate": 5.642495784148398e-07, "loss": 0.0006, "reward": 3.3137210607528687, "reward_std": 0.0238959863781929, "rewards/final_reward": 0.9728564177071632, "rewards/mask_iou_reward": 0.4864282088535816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3137210607528687, "rewards/thk_ans_format_reward": 1.0, "step": 1292, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 102.6875, "epoch": 2.1838111298482294, "grad_norm": 9.334431581768936, "kl": 0.59765625, "learning_rate": 5.639123102866779e-07, "loss": 0.0006, "reward": 3.366030693054199, "reward_std": 0.25199363101273775, "rewards/final_reward": 1.5338307014555888, "rewards/mask_iou_reward": 0.7669153507277944, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3660306930541992, "rewards/thk_ans_format_reward": 1.0, "step": 1293, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 106.4375, "epoch": 2.1854974704890386, "grad_norm": 7.261057150063994, "kl": 0.5703125, "learning_rate": 5.635750421585161e-07, "loss": 0.0006, "reward": 3.1913743019104004, "reward_std": 0.09568795189261436, "rewards/final_reward": 0.8999432986526881, "rewards/mask_iou_reward": 0.44997164932634404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1913742423057556, "rewards/thk_ans_format_reward": 1.0, "step": 1294, "think_completion_length": 38.78125 }, { "clip_ratio": 0.0, "completion_length": 107.9375, "epoch": 2.1871838111298483, "grad_norm": 11.890263919453417, "kl": 0.537109375, "learning_rate": 5.632377740303541e-07, "loss": 0.0005, "reward": 3.1532232761383057, "reward_std": 0.568440705537796, "rewards/final_reward": 1.0693202949044516, "rewards/mask_iou_reward": 0.5346601474522258, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1532232761383057, "rewards/thk_ans_format_reward": 1.0, "step": 1295, "think_completion_length": 33.25 }, { "clip_ratio": 0.0, "completion_length": 110.65625, "epoch": 2.1888701517706575, "grad_norm": 29.777276960107326, "kl": 0.564453125, "learning_rate": 5.629005059021922e-07, "loss": 0.0006, "reward": 3.4506603479385376, "reward_std": 0.21746337413787842, "rewards/final_reward": 1.3186164815143446, "rewards/mask_iou_reward": 0.6593082407571723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4506604075431824, "rewards/thk_ans_format_reward": 1.0, "step": 1296, "think_completion_length": 34.90625 }, { "clip_ratio": 0.0, "completion_length": 103.875, "epoch": 2.190556492411467, "grad_norm": 10.137710080819277, "kl": 0.515625, "learning_rate": 5.625632377740303e-07, "loss": 0.0005, "reward": 3.43321430683136, "reward_std": 0.18915096670389175, "rewards/final_reward": 1.2422396347537354, "rewards/mask_iou_reward": 0.6211198173768677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4332143068313599, "rewards/thk_ans_format_reward": 1.0, "step": 1297, "think_completion_length": 32.21875 }, { "clip_ratio": 0.0, "completion_length": 106.765625, "epoch": 2.1922428330522767, "grad_norm": 8.844318257916903, "kl": 0.5546875, "learning_rate": 5.622259696458684e-07, "loss": 0.0005, "reward": 3.1780149936676025, "reward_std": 0.08533276757225394, "rewards/final_reward": 0.7760199678805226, "rewards/mask_iou_reward": 0.3880099839402613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1780150532722473, "rewards/thk_ans_format_reward": 1.0, "step": 1298, "think_completion_length": 36.5625 }, { "clip_ratio": 0.0, "completion_length": 105.828125, "epoch": 2.193929173693086, "grad_norm": 14.424263342320263, "kl": 0.53125, "learning_rate": 5.618887015177065e-07, "loss": 0.0005, "reward": 3.1475911140441895, "reward_std": 0.3342314139008522, "rewards/final_reward": 1.5412919740557918, "rewards/mask_iou_reward": 0.7706459870278959, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.147591084241867, "rewards/thk_ans_format_reward": 1.0, "step": 1299, "think_completion_length": 36.0625 }, { "clip_ratio": 0.0, "completion_length": 159.578125, "epoch": 2.1956155143338956, "grad_norm": 169.85177106817954, "kl": 0.458984375, "learning_rate": 5.615514333895447e-07, "loss": 0.0005, "reward": 3.320096254348755, "reward_std": 0.22164292633533478, "rewards/final_reward": 1.459633178376593, "rewards/mask_iou_reward": 0.7298165891882965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3200964331626892, "rewards/thk_ans_format_reward": 1.0, "step": 1300, "think_completion_length": 36.1875 }, { "clip_ratio": 0.0, "completion_length": 109.921875, "epoch": 2.1973018549747048, "grad_norm": 9.054714053372589, "kl": 0.49609375, "learning_rate": 5.612141652613828e-07, "loss": 0.0005, "reward": 3.2595558166503906, "reward_std": 0.3579244762659073, "rewards/final_reward": 1.6795588356267266, "rewards/mask_iou_reward": 0.8397794178133633, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2595559358596802, "rewards/thk_ans_format_reward": 1.0, "step": 1301, "think_completion_length": 39.03125 }, { "clip_ratio": 0.0, "completion_length": 105.90625, "epoch": 2.1989881956155144, "grad_norm": 7.766385435361891, "kl": 0.529296875, "learning_rate": 5.608768971332209e-07, "loss": 0.0005, "reward": 3.2027809619903564, "reward_std": 0.13372624665498734, "rewards/final_reward": 0.9463063106918193, "rewards/mask_iou_reward": 0.4731531553459096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2027809023857117, "rewards/thk_ans_format_reward": 1.0, "step": 1302, "think_completion_length": 34.28125 }, { "clip_ratio": 0.0, "completion_length": 106.53125, "epoch": 2.2006745362563236, "grad_norm": 17.516372215159315, "kl": 0.58984375, "learning_rate": 5.605396290050591e-07, "loss": 0.0006, "reward": 3.0270960330963135, "reward_std": 0.1305270530283451, "rewards/final_reward": 0.9893757757420829, "rewards/mask_iou_reward": 0.49468788787104145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0270961076021194, "rewards/thk_ans_format_reward": 1.0, "step": 1303, "think_completion_length": 34.3125 }, { "clip_ratio": 0.0, "completion_length": 109.953125, "epoch": 2.2023608768971332, "grad_norm": 9.152942586295097, "kl": 0.494140625, "learning_rate": 5.602023608768971e-07, "loss": 0.0005, "reward": 3.0987539291381836, "reward_std": 0.21952814608812332, "rewards/final_reward": 1.2299166486700184, "rewards/mask_iou_reward": 0.6149583243350092, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0987539291381836, "rewards/thk_ans_format_reward": 1.0, "step": 1304, "think_completion_length": 41.25 }, { "clip_ratio": 0.0, "completion_length": 120.28125, "epoch": 2.204047217537943, "grad_norm": 10.836055739499022, "kl": 0.55859375, "learning_rate": 5.598650927487351e-07, "loss": 0.0006, "reward": 3.4143790006637573, "reward_std": 0.2662373185157776, "rewards/final_reward": 1.3339210672457629, "rewards/mask_iou_reward": 0.6669605336228814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4143790006637573, "rewards/thk_ans_format_reward": 1.0, "step": 1305, "think_completion_length": 35.09375 }, { "clip_ratio": 0.0, "completion_length": 96.140625, "epoch": 2.205733558178752, "grad_norm": 8.93654538137696, "kl": 0.548828125, "learning_rate": 5.595278246205733e-07, "loss": 0.0005, "reward": 3.10241162776947, "reward_std": 0.39101406559348106, "rewards/final_reward": 0.8648975132692235, "rewards/mask_iou_reward": 0.43244875663461174, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1024115979671478, "rewards/thk_ans_format_reward": 1.0, "step": 1306, "think_completion_length": 35.625 }, { "clip_ratio": 0.0, "completion_length": 117.265625, "epoch": 2.2074198988195617, "grad_norm": 15.25434670003201, "kl": 0.54296875, "learning_rate": 5.591905564924114e-07, "loss": 0.0005, "reward": 3.0681989192962646, "reward_std": 0.301235631108284, "rewards/final_reward": 0.7502557477642079, "rewards/mask_iou_reward": 0.37512787388210395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0681988894939423, "rewards/thk_ans_format_reward": 1.0, "step": 1307, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 141.46875, "epoch": 2.209106239460371, "grad_norm": 17.605437350876464, "kl": 0.4931640625, "learning_rate": 5.588532883642495e-07, "loss": 0.0005, "reward": 3.771380066871643, "reward_std": 0.11449036654084921, "rewards/final_reward": 1.781286348007805, "rewards/mask_iou_reward": 0.8906431740039025, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.771379828453064, "rewards/thk_ans_format_reward": 1.0, "step": 1308, "think_completion_length": 42.0625 }, { "clip_ratio": 0.0, "completion_length": 109.234375, "epoch": 2.2107925801011805, "grad_norm": 14.745100662483619, "kl": 0.541015625, "learning_rate": 5.585160202360877e-07, "loss": 0.0006, "reward": 3.74048388004303, "reward_std": 0.01679701777175069, "rewards/final_reward": 1.5908809315176096, "rewards/mask_iou_reward": 0.7954404657588048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.740483820438385, "rewards/thk_ans_format_reward": 1.0, "step": 1309, "think_completion_length": 41.1875 }, { "clip_ratio": 0.0, "completion_length": 142.75, "epoch": 2.2124789207419897, "grad_norm": 5.431669802578124, "kl": 0.5029296875, "learning_rate": 5.581787521079258e-07, "loss": 0.0005, "reward": 3.6262569427490234, "reward_std": 0.1467602625489235, "rewards/final_reward": 1.7593725993227114, "rewards/mask_iou_reward": 0.8796862996613557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6262570023536682, "rewards/thk_ans_format_reward": 1.0, "step": 1310, "think_completion_length": 36.375 }, { "clip_ratio": 0.0, "completion_length": 95.578125, "epoch": 2.2141652613827993, "grad_norm": 6.749141833190477, "kl": 0.5625, "learning_rate": 5.57841483979764e-07, "loss": 0.0006, "reward": 3.4961687326431274, "reward_std": 0.1268460345454514, "rewards/final_reward": 1.882680523286068, "rewards/mask_iou_reward": 0.941340261643034, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4961687326431274, "rewards/thk_ans_format_reward": 1.0, "step": 1311, "think_completion_length": 38.625 }, { "clip_ratio": 0.0, "completion_length": 108.59375, "epoch": 2.2158516020236085, "grad_norm": 13.437066869204576, "kl": 0.50390625, "learning_rate": 5.575042158516021e-07, "loss": 0.0005, "reward": 3.282013177871704, "reward_std": 0.3590443627908826, "rewards/final_reward": 1.405391687345856, "rewards/mask_iou_reward": 0.702695843672928, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.282013177871704, "rewards/thk_ans_format_reward": 1.0, "step": 1312, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 105.484375, "epoch": 2.217537942664418, "grad_norm": 6.271928895012018, "kl": 0.58984375, "learning_rate": 5.5716694772344e-07, "loss": 0.0006, "reward": 3.3246023654937744, "reward_std": 0.1027023196220398, "rewards/final_reward": 0.9758942077909897, "rewards/mask_iou_reward": 0.48794710389549484, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.324602335691452, "rewards/thk_ans_format_reward": 1.0, "step": 1313, "think_completion_length": 35.25 }, { "clip_ratio": 0.0, "completion_length": 132.828125, "epoch": 2.219224283305228, "grad_norm": 13.85784093229938, "kl": 0.546875, "learning_rate": 5.568296795952782e-07, "loss": 0.0005, "reward": 3.347952723503113, "reward_std": 0.1474056839942932, "rewards/final_reward": 1.0762541446075893, "rewards/mask_iou_reward": 0.5381270723037946, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3479526042938232, "rewards/thk_ans_format_reward": 1.0, "step": 1314, "think_completion_length": 40.03125 }, { "clip_ratio": 0.0, "completion_length": 104.609375, "epoch": 2.220910623946037, "grad_norm": 24.114698680934087, "kl": 0.60546875, "learning_rate": 5.564924114671163e-07, "loss": 0.0006, "reward": 3.19465708732605, "reward_std": 0.1294799353927374, "rewards/final_reward": 1.13020496978562, "rewards/mask_iou_reward": 0.56510248489281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1946569681167603, "rewards/thk_ans_format_reward": 1.0, "step": 1315, "think_completion_length": 37.65625 }, { "clip_ratio": 0.0, "completion_length": 147.859375, "epoch": 2.2225969645868466, "grad_norm": 40.02479739699717, "kl": 0.486328125, "learning_rate": 5.561551433389544e-07, "loss": 0.0005, "reward": 3.1586406230926514, "reward_std": 0.19064200669527054, "rewards/final_reward": 0.9896240409602192, "rewards/mask_iou_reward": 0.4948120204801096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1586405336856842, "rewards/thk_ans_format_reward": 1.0, "step": 1316, "think_completion_length": 36.46875 }, { "clip_ratio": 0.0, "completion_length": 141.46875, "epoch": 2.224283305227656, "grad_norm": 6.046357665038441, "kl": 0.5341796875, "learning_rate": 5.558178752107926e-07, "loss": 0.0005, "reward": 2.572770595550537, "reward_std": 0.11222269444260746, "rewards/final_reward": 0.11813676432442753, "rewards/mask_iou_reward": 0.059068382162213766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5727706551551819, "rewards/thk_ans_format_reward": 1.0, "step": 1317, "think_completion_length": 41.1875 }, { "clip_ratio": 0.0, "completion_length": 148.203125, "epoch": 2.2259696458684655, "grad_norm": 11.749521563087038, "kl": 0.4873046875, "learning_rate": 5.554806070826307e-07, "loss": 0.0005, "reward": 3.731950521469116, "reward_std": 0.06127586215734482, "rewards/final_reward": 1.704703487748192, "rewards/mask_iou_reward": 0.852351743874096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.731950581073761, "rewards/thk_ans_format_reward": 1.0, "step": 1318, "think_completion_length": 34.375 }, { "clip_ratio": 0.0, "completion_length": 103.15625, "epoch": 2.2276559865092747, "grad_norm": 9.448524216182863, "kl": 0.55078125, "learning_rate": 5.551433389544688e-07, "loss": 0.0006, "reward": 3.561526656150818, "reward_std": 0.017425385303795338, "rewards/final_reward": 1.835540433113184, "rewards/mask_iou_reward": 0.917770216556592, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5615267157554626, "rewards/thk_ans_format_reward": 1.0, "step": 1319, "think_completion_length": 31.4375 }, { "clip_ratio": 0.0, "completion_length": 107.046875, "epoch": 2.2293423271500843, "grad_norm": 17.403947483194223, "kl": 0.576171875, "learning_rate": 5.54806070826307e-07, "loss": 0.0006, "reward": 3.371973991394043, "reward_std": 0.29090772196650505, "rewards/final_reward": 0.8479355704681258, "rewards/mask_iou_reward": 0.4239677852340629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3719740509986877, "rewards/thk_ans_format_reward": 1.0, "step": 1320, "think_completion_length": 39.125 }, { "clip_ratio": 0.0, "completion_length": 109.59375, "epoch": 2.231028667790894, "grad_norm": 5.326169338549298, "kl": 0.53125, "learning_rate": 5.544688026981449e-07, "loss": 0.0005, "reward": 3.568778872489929, "reward_std": 0.3622869700193405, "rewards/final_reward": 1.4941932238839515, "rewards/mask_iou_reward": 0.7470966119419757, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5687788724899292, "rewards/thk_ans_format_reward": 1.0, "step": 1321, "think_completion_length": 38.0 }, { "clip_ratio": 0.0, "completion_length": 106.0625, "epoch": 2.232715008431703, "grad_norm": 14.990765910375735, "kl": 0.541015625, "learning_rate": 5.54131534569983e-07, "loss": 0.0005, "reward": 3.765368938446045, "reward_std": 0.04102184996008873, "rewards/final_reward": 1.6718389758842933, "rewards/mask_iou_reward": 0.8359194879421467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.765368938446045, "rewards/thk_ans_format_reward": 1.0, "step": 1322, "think_completion_length": 32.25 }, { "clip_ratio": 0.0, "completion_length": 105.953125, "epoch": 2.2344013490725128, "grad_norm": 9.527736157533576, "kl": 0.5, "learning_rate": 5.537942664418212e-07, "loss": 0.0005, "reward": 3.199765205383301, "reward_std": 0.1792975813150406, "rewards/final_reward": 1.5867222237886813, "rewards/mask_iou_reward": 0.7933611118943407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1997651159763336, "rewards/thk_ans_format_reward": 1.0, "step": 1323, "think_completion_length": 35.875 }, { "clip_ratio": 0.0, "completion_length": 113.046875, "epoch": 2.236087689713322, "grad_norm": 11.753149753032291, "kl": 0.486328125, "learning_rate": 5.534569983136593e-07, "loss": 0.0005, "reward": 3.013986587524414, "reward_std": 0.09028564766049385, "rewards/final_reward": 1.502550902240183, "rewards/mask_iou_reward": 0.7512754511200915, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.013986587524414, "rewards/thk_ans_format_reward": 1.0, "step": 1324, "think_completion_length": 42.15625 }, { "clip_ratio": 0.0, "completion_length": 107.03125, "epoch": 2.2377740303541316, "grad_norm": 5.303122418256821, "kl": 0.556640625, "learning_rate": 5.531197301854974e-07, "loss": 0.0006, "reward": 3.7323914766311646, "reward_std": 0.015024483669549227, "rewards/final_reward": 1.5586951757322525, "rewards/mask_iou_reward": 0.7793475878661262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7323914170265198, "rewards/thk_ans_format_reward": 1.0, "step": 1325, "think_completion_length": 37.5625 }, { "clip_ratio": 0.0, "completion_length": 106.546875, "epoch": 2.2394603709949408, "grad_norm": 6.777899205372352, "kl": 0.537109375, "learning_rate": 5.527824620573356e-07, "loss": 0.0005, "reward": 3.7131404876708984, "reward_std": 0.25278275832533836, "rewards/final_reward": 1.6623295700856646, "rewards/mask_iou_reward": 0.8311647850428323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7131404876708984, "rewards/thk_ans_format_reward": 1.0, "step": 1326, "think_completion_length": 33.6875 }, { "clip_ratio": 0.0, "completion_length": 111.40625, "epoch": 2.2411467116357504, "grad_norm": 20.40708241683717, "kl": 0.55859375, "learning_rate": 5.524451939291737e-07, "loss": 0.0006, "reward": 3.3399040699005127, "reward_std": 0.1821693703532219, "rewards/final_reward": 1.5421490310798234, "rewards/mask_iou_reward": 0.7710745155399117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3399040699005127, "rewards/thk_ans_format_reward": 1.0, "step": 1327, "think_completion_length": 38.0625 }, { "clip_ratio": 0.0, "completion_length": 104.46875, "epoch": 2.24283305227656, "grad_norm": 7.464542519723734, "kl": 0.580078125, "learning_rate": 5.521079258010118e-07, "loss": 0.0006, "reward": 3.447382092475891, "reward_std": 0.2822858989238739, "rewards/final_reward": 1.7969582167403346, "rewards/mask_iou_reward": 0.8984791083701673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4473822116851807, "rewards/thk_ans_format_reward": 1.0, "step": 1328, "think_completion_length": 35.4375 }, { "clip_ratio": 0.0, "completion_length": 110.890625, "epoch": 2.2445193929173692, "grad_norm": 6.307277165051091, "kl": 0.5390625, "learning_rate": 5.5177065767285e-07, "loss": 0.0005, "reward": 2.7995030879974365, "reward_std": 0.02789947483688593, "rewards/final_reward": 0.12901411624845935, "rewards/mask_iou_reward": 0.06450705812422967, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7995031476020813, "rewards/thk_ans_format_reward": 1.0, "step": 1329, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 109.96875, "epoch": 2.246205733558179, "grad_norm": 24.16342895469795, "kl": 0.501953125, "learning_rate": 5.514333895446879e-07, "loss": 0.0005, "reward": 3.5670766830444336, "reward_std": 0.13270878046751022, "rewards/final_reward": 1.3634549278573478, "rewards/mask_iou_reward": 0.6817274639286739, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.567076563835144, "rewards/thk_ans_format_reward": 1.0, "step": 1330, "think_completion_length": 37.59375 }, { "clip_ratio": 0.0, "completion_length": 130.359375, "epoch": 2.247892074198988, "grad_norm": 16.82568910254876, "kl": 0.541015625, "learning_rate": 5.51096121416526e-07, "loss": 0.0005, "reward": 3.664430260658264, "reward_std": 0.09643076732754707, "rewards/final_reward": 1.8536971501973039, "rewards/mask_iou_reward": 0.9268485750986519, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6644301414489746, "rewards/thk_ans_format_reward": 1.0, "step": 1331, "think_completion_length": 36.4375 }, { "clip_ratio": 0.0, "completion_length": 110.875, "epoch": 2.2495784148397977, "grad_norm": 18.817463938613194, "kl": 0.56640625, "learning_rate": 5.507588532883642e-07, "loss": 0.0006, "reward": 3.710629940032959, "reward_std": 0.06464794278144836, "rewards/final_reward": 1.9061641571540149, "rewards/mask_iou_reward": 0.9530820785770074, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.710629940032959, "rewards/thk_ans_format_reward": 1.0, "step": 1332, "think_completion_length": 40.375 }, { "clip_ratio": 0.0, "completion_length": 109.65625, "epoch": 2.251264755480607, "grad_norm": 11.400527521207344, "kl": 0.677734375, "learning_rate": 5.504215851602023e-07, "loss": 0.0007, "reward": 3.312086343765259, "reward_std": 0.20460295677185059, "rewards/final_reward": 1.68854368302632, "rewards/mask_iou_reward": 0.84427184151316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3120863437652588, "rewards/thk_ans_format_reward": 1.0, "step": 1333, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 110.515625, "epoch": 2.2529510961214165, "grad_norm": 21.22683690022595, "kl": 0.5078125, "learning_rate": 5.500843170320405e-07, "loss": 0.0005, "reward": 3.294277548789978, "reward_std": 0.022508380352519453, "rewards/final_reward": 1.325478289544912, "rewards/mask_iou_reward": 0.662739144772456, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.294277548789978, "rewards/thk_ans_format_reward": 1.0, "step": 1334, "think_completion_length": 39.59375 }, { "clip_ratio": 0.0, "completion_length": 108.515625, "epoch": 2.254637436762226, "grad_norm": 6.630398678310001, "kl": 0.60546875, "learning_rate": 5.497470489038786e-07, "loss": 0.0006, "reward": 3.027361512184143, "reward_std": 0.14319058507680893, "rewards/final_reward": 1.07448501438753, "rewards/mask_iou_reward": 0.537242507193765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0273613929748535, "rewards/thk_ans_format_reward": 1.0, "step": 1335, "think_completion_length": 37.5625 }, { "clip_ratio": 0.0, "completion_length": 109.234375, "epoch": 2.2563237774030354, "grad_norm": 5.367508776552314, "kl": 0.546875, "learning_rate": 5.494097807757167e-07, "loss": 0.0005, "reward": 3.3002820014953613, "reward_std": 0.17817065119743347, "rewards/final_reward": 1.2256184729200479, "rewards/mask_iou_reward": 0.6128092364600239, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.300282061100006, "rewards/thk_ans_format_reward": 1.0, "step": 1336, "think_completion_length": 38.03125 }, { "clip_ratio": 0.0, "completion_length": 106.515625, "epoch": 2.258010118043845, "grad_norm": 11.956116301646125, "kl": 0.48046875, "learning_rate": 5.490725126475549e-07, "loss": 0.0005, "reward": 3.174812436103821, "reward_std": 0.3088492304086685, "rewards/final_reward": 1.0705311364499366, "rewards/mask_iou_reward": 0.5352655682249683, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.174812376499176, "rewards/thk_ans_format_reward": 1.0, "step": 1337, "think_completion_length": 39.28125 }, { "clip_ratio": 0.0, "completion_length": 110.6875, "epoch": 2.259696458684654, "grad_norm": 9.85948051241923, "kl": 0.53515625, "learning_rate": 5.487352445193929e-07, "loss": 0.0005, "reward": 3.329120635986328, "reward_std": 0.1026376448571682, "rewards/final_reward": 1.8027696973468261, "rewards/mask_iou_reward": 0.9013848486734131, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3291206359863281, "rewards/thk_ans_format_reward": 1.0, "step": 1338, "think_completion_length": 36.25 }, { "clip_ratio": 0.0, "completion_length": 108.46875, "epoch": 2.261382799325464, "grad_norm": 14.780010393420275, "kl": 0.638671875, "learning_rate": 5.483979763912309e-07, "loss": 0.0006, "reward": 3.4766530990600586, "reward_std": 0.07312630349770188, "rewards/final_reward": 1.3158479292079335, "rewards/mask_iou_reward": 0.6579239646039667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.476653277873993, "rewards/thk_ans_format_reward": 1.0, "step": 1339, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 108.1875, "epoch": 2.263069139966273, "grad_norm": 10.052671629886701, "kl": 0.59375, "learning_rate": 5.480607082630691e-07, "loss": 0.0006, "reward": 3.359123110771179, "reward_std": 0.1407726462930441, "rewards/final_reward": 1.7389720199769068, "rewards/mask_iou_reward": 0.8694860099884534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3591230809688568, "rewards/thk_ans_format_reward": 1.0, "step": 1340, "think_completion_length": 40.96875 }, { "clip_ratio": 0.0, "completion_length": 106.953125, "epoch": 2.2647554806070826, "grad_norm": 9.111437858737153, "kl": 0.68359375, "learning_rate": 5.477234401349072e-07, "loss": 0.0007, "reward": 3.3291393518447876, "reward_std": 0.13854620698839426, "rewards/final_reward": 1.5084446019798792, "rewards/mask_iou_reward": 0.7542223009899396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3291394114494324, "rewards/thk_ans_format_reward": 1.0, "step": 1341, "think_completion_length": 35.3125 }, { "clip_ratio": 0.0, "completion_length": 105.4375, "epoch": 2.2664418212478923, "grad_norm": 15.210744253675966, "kl": 0.609375, "learning_rate": 5.473861720067453e-07, "loss": 0.0005, "reward": 3.517295718193054, "reward_std": 0.1796765811741352, "rewards/final_reward": 1.1933201488455092, "rewards/mask_iou_reward": 0.5966600744227546, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5172955989837646, "rewards/thk_ans_format_reward": 1.0, "step": 1342, "think_completion_length": 34.9375 }, { "clip_ratio": 0.0, "completion_length": 107.578125, "epoch": 2.2681281618887015, "grad_norm": 13.824162919930119, "kl": 0.5390625, "learning_rate": 5.470489038785835e-07, "loss": 0.0005, "reward": 2.7883373498916626, "reward_std": 0.13726412784308195, "rewards/final_reward": 0.7340926898184925, "rewards/mask_iou_reward": 0.36704634490924626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7883373498916626, "rewards/thk_ans_format_reward": 1.0, "step": 1343, "think_completion_length": 33.84375 }, { "clip_ratio": 0.0, "completion_length": 107.703125, "epoch": 2.269814502529511, "grad_norm": 10.862051768430439, "kl": 0.521484375, "learning_rate": 5.467116357504216e-07, "loss": 0.0005, "reward": 3.518467664718628, "reward_std": 0.0765317790210247, "rewards/final_reward": 1.8295472667170398, "rewards/mask_iou_reward": 0.9147736333585199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5184677839279175, "rewards/thk_ans_format_reward": 1.0, "step": 1344, "think_completion_length": 38.0 }, { "clip_ratio": 0.0, "completion_length": 116.9375, "epoch": 2.2715008431703203, "grad_norm": 18.923876243262153, "kl": 0.640625, "learning_rate": 5.463743676222597e-07, "loss": 0.0006, "reward": 3.4015763998031616, "reward_std": 0.2154865264892578, "rewards/final_reward": 1.243939497565763, "rewards/mask_iou_reward": 0.6219697487828815, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4015763401985168, "rewards/thk_ans_format_reward": 1.0, "step": 1345, "think_completion_length": 34.78125 }, { "clip_ratio": 0.0, "completion_length": 110.578125, "epoch": 2.27318718381113, "grad_norm": 10.46959866402287, "kl": 0.69921875, "learning_rate": 5.460370994940978e-07, "loss": 0.0007, "reward": 3.545522689819336, "reward_std": 0.14559809491038322, "rewards/final_reward": 1.663084276520829, "rewards/mask_iou_reward": 0.8315421382604145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5455228686332703, "rewards/thk_ans_format_reward": 1.0, "step": 1346, "think_completion_length": 37.96875 }, { "clip_ratio": 0.0, "completion_length": 101.984375, "epoch": 2.274873524451939, "grad_norm": 13.535203472597521, "kl": 0.5234375, "learning_rate": 5.456998313659359e-07, "loss": 0.0005, "reward": 3.331748127937317, "reward_std": 0.3237799145281315, "rewards/final_reward": 1.1902394554738316, "rewards/mask_iou_reward": 0.5951197277369158, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.331748127937317, "rewards/thk_ans_format_reward": 1.0, "step": 1347, "think_completion_length": 33.75 }, { "clip_ratio": 0.0, "completion_length": 108.984375, "epoch": 2.2765598650927488, "grad_norm": 8.446735116918521, "kl": 0.5400390625, "learning_rate": 5.453625632377739e-07, "loss": 0.0005, "reward": 3.732712984085083, "reward_std": 0.03800155781209469, "rewards/final_reward": 1.6502427498652197, "rewards/mask_iou_reward": 0.8251213749326098, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7327128648757935, "rewards/thk_ans_format_reward": 1.0, "step": 1348, "think_completion_length": 36.5625 }, { "clip_ratio": 0.0, "completion_length": 103.96875, "epoch": 2.2782462057335584, "grad_norm": 7.501407452838234, "kl": 0.64453125, "learning_rate": 5.450252951096121e-07, "loss": 0.0006, "reward": 2.9089930057525635, "reward_std": 0.10850898921489716, "rewards/final_reward": 0.8053101366110927, "rewards/mask_iou_reward": 0.40265506830554637, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9089928865432739, "rewards/thk_ans_format_reward": 1.0, "step": 1349, "think_completion_length": 37.46875 }, { "clip_ratio": 0.0, "completion_length": 107.125, "epoch": 2.2799325463743676, "grad_norm": 5.933995703982366, "kl": 0.57421875, "learning_rate": 5.446880269814502e-07, "loss": 0.0006, "reward": 3.5692362785339355, "reward_std": 0.30243778228759766, "rewards/final_reward": 1.56085562949582, "rewards/mask_iou_reward": 0.78042781474791, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5692362189292908, "rewards/thk_ans_format_reward": 1.0, "step": 1350, "think_completion_length": 37.125 }, { "clip_ratio": 0.0, "completion_length": 109.71875, "epoch": 2.2816188870151772, "grad_norm": 16.075818615178704, "kl": 0.576171875, "learning_rate": 5.443507588532883e-07, "loss": 0.0006, "reward": 2.991969585418701, "reward_std": 0.3233000710606575, "rewards/final_reward": 1.320913964818911, "rewards/mask_iou_reward": 0.6604569824094555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9919696748256683, "rewards/thk_ans_format_reward": 1.0, "step": 1351, "think_completion_length": 35.59375 }, { "clip_ratio": 0.0, "completion_length": 102.84375, "epoch": 2.2833052276559864, "grad_norm": 6.332802892368575, "kl": 0.5859375, "learning_rate": 5.440134907251265e-07, "loss": 0.0006, "reward": 3.4482442140579224, "reward_std": 0.22082431614398956, "rewards/final_reward": 1.2399281795165056, "rewards/mask_iou_reward": 0.6199640897582528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.448244333267212, "rewards/thk_ans_format_reward": 1.0, "step": 1352, "think_completion_length": 34.3125 }, { "clip_ratio": 0.0, "completion_length": 110.53125, "epoch": 2.284991568296796, "grad_norm": 10.536058948852574, "kl": 0.517578125, "learning_rate": 5.436762225969646e-07, "loss": 0.0005, "reward": 3.4116357564926147, "reward_std": 0.3409469872713089, "rewards/final_reward": 1.5051610832476137, "rewards/mask_iou_reward": 0.7525805416238068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4116357564926147, "rewards/thk_ans_format_reward": 1.0, "step": 1353, "think_completion_length": 36.46875 }, { "clip_ratio": 0.0, "completion_length": 119.234375, "epoch": 2.2866779089376053, "grad_norm": 8.984527740459715, "kl": 0.54296875, "learning_rate": 5.433389544688026e-07, "loss": 0.0006, "reward": 3.5945018529891968, "reward_std": 0.08709852397441864, "rewards/final_reward": 1.6458639418072991, "rewards/mask_iou_reward": 0.8229319709036496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5945017337799072, "rewards/thk_ans_format_reward": 1.0, "step": 1354, "think_completion_length": 36.9375 }, { "clip_ratio": 0.0, "completion_length": 106.859375, "epoch": 2.288364249578415, "grad_norm": 10.983386192174155, "kl": 0.578125, "learning_rate": 5.430016863406408e-07, "loss": 0.0006, "reward": 3.6098439693450928, "reward_std": 0.04654739610850811, "rewards/final_reward": 1.3116819439689924, "rewards/mask_iou_reward": 0.6558409719844962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6098440289497375, "rewards/thk_ans_format_reward": 1.0, "step": 1355, "think_completion_length": 34.375 }, { "clip_ratio": 0.0, "completion_length": 109.640625, "epoch": 2.2900505902192245, "grad_norm": 5.577383873072806, "kl": 0.533203125, "learning_rate": 5.426644182124789e-07, "loss": 0.0005, "reward": 3.821853280067444, "reward_std": 0.02090123761445284, "rewards/final_reward": 1.8809040738056706, "rewards/mask_iou_reward": 0.9404520369028353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8218533396720886, "rewards/thk_ans_format_reward": 1.0, "step": 1356, "think_completion_length": 37.25 }, { "clip_ratio": 0.0, "completion_length": 112.078125, "epoch": 2.2917369308600337, "grad_norm": 12.746405765782752, "kl": 0.55859375, "learning_rate": 5.423271500843169e-07, "loss": 0.0006, "reward": 3.242617607116699, "reward_std": 0.08481218665838242, "rewards/final_reward": 0.8098932419567305, "rewards/mask_iou_reward": 0.40494662097836526, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2426177263259888, "rewards/thk_ans_format_reward": 1.0, "step": 1357, "think_completion_length": 38.8125 }, { "clip_ratio": 0.0, "completion_length": 106.71875, "epoch": 2.2934232715008434, "grad_norm": 6.137303940016678, "kl": 0.55078125, "learning_rate": 5.419898819561551e-07, "loss": 0.0005, "reward": 3.6336123943328857, "reward_std": 0.1637212010100484, "rewards/final_reward": 1.5722296014889336, "rewards/mask_iou_reward": 0.7861148007444668, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.633612334728241, "rewards/thk_ans_format_reward": 1.0, "step": 1358, "think_completion_length": 35.53125 }, { "clip_ratio": 0.0, "completion_length": 107.875, "epoch": 2.2951096121416525, "grad_norm": 8.523910222897978, "kl": 0.521484375, "learning_rate": 5.416526138279932e-07, "loss": 0.0005, "reward": 2.9611575603485107, "reward_std": 0.5413458049297333, "rewards/final_reward": 0.999535242586827, "rewards/mask_iou_reward": 0.4997676212934135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9611575305461884, "rewards/thk_ans_format_reward": 1.0, "step": 1359, "think_completion_length": 39.6875 }, { "clip_ratio": 0.0, "completion_length": 110.703125, "epoch": 2.296795952782462, "grad_norm": 23.755939541075783, "kl": 0.52734375, "learning_rate": 5.413153456998314e-07, "loss": 0.0005, "reward": 3.1955270767211914, "reward_std": 0.1914454996585846, "rewards/final_reward": 0.6409054223441232, "rewards/mask_iou_reward": 0.3204527111720616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1955272555351257, "rewards/thk_ans_format_reward": 1.0, "step": 1360, "think_completion_length": 34.625 }, { "clip_ratio": 0.0, "completion_length": 105.890625, "epoch": 2.2984822934232714, "grad_norm": 5.799952126679054, "kl": 0.64453125, "learning_rate": 5.409780775716695e-07, "loss": 0.0007, "reward": 2.923759698867798, "reward_std": 0.3102700114250183, "rewards/final_reward": 0.4870952352978929, "rewards/mask_iou_reward": 0.24354761764894645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9237594902515411, "rewards/thk_ans_format_reward": 1.0, "step": 1361, "think_completion_length": 35.875 }, { "clip_ratio": 0.0, "completion_length": 110.046875, "epoch": 2.300168634064081, "grad_norm": 8.073495621179406, "kl": 0.55859375, "learning_rate": 5.406408094435076e-07, "loss": 0.0006, "reward": 3.1795326471328735, "reward_std": 0.15434248000383377, "rewards/final_reward": 0.7367230781079752, "rewards/mask_iou_reward": 0.3683615390539876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1795325875282288, "rewards/thk_ans_format_reward": 1.0, "step": 1362, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 110.46875, "epoch": 2.30185497470489, "grad_norm": 34.4840939789534, "kl": 0.5546875, "learning_rate": 5.403035413153457e-07, "loss": 0.0006, "reward": 3.4326142072677612, "reward_std": 0.19122769800014794, "rewards/final_reward": 1.3455692918990243, "rewards/mask_iou_reward": 0.6727846459495122, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432614266872406, "rewards/thk_ans_format_reward": 1.0, "step": 1363, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 109.0625, "epoch": 2.3035413153457, "grad_norm": 9.345827782662276, "kl": 0.5, "learning_rate": 5.399662731871838e-07, "loss": 0.0005, "reward": 3.6630271673202515, "reward_std": 0.07718203030526638, "rewards/final_reward": 1.6574970505074886, "rewards/mask_iou_reward": 0.8287485252537443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6630271077156067, "rewards/thk_ans_format_reward": 1.0, "step": 1364, "think_completion_length": 36.53125 }, { "clip_ratio": 0.0, "completion_length": 127.78125, "epoch": 2.305227655986509, "grad_norm": 11.544908340862662, "kl": 0.494140625, "learning_rate": 5.396290050590219e-07, "loss": 0.0005, "reward": 3.705686330795288, "reward_std": 0.05781856086105108, "rewards/final_reward": 1.90387979568909, "rewards/mask_iou_reward": 0.951939897844545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7056862711906433, "rewards/thk_ans_format_reward": 1.0, "step": 1365, "think_completion_length": 34.03125 }, { "clip_ratio": 0.0, "completion_length": 108.359375, "epoch": 2.3069139966273187, "grad_norm": 13.856901701164764, "kl": 0.61328125, "learning_rate": 5.3929173693086e-07, "loss": 0.0006, "reward": 3.1082570552825928, "reward_std": 0.1994151696562767, "rewards/final_reward": 1.4614486050282012, "rewards/mask_iou_reward": 0.7307243025141006, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1082570552825928, "rewards/thk_ans_format_reward": 1.0, "step": 1366, "think_completion_length": 37.0 }, { "clip_ratio": 0.0, "completion_length": 107.125, "epoch": 2.3086003372681283, "grad_norm": 10.279635984822152, "kl": 0.53125, "learning_rate": 5.389544688026981e-07, "loss": 0.0005, "reward": 3.41591477394104, "reward_std": 0.1839870810508728, "rewards/final_reward": 1.6761410901629847, "rewards/mask_iou_reward": 0.8380705450814924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4159146547317505, "rewards/thk_ans_format_reward": 1.0, "step": 1367, "think_completion_length": 39.53125 }, { "clip_ratio": 0.0, "completion_length": 106.859375, "epoch": 2.3102866779089375, "grad_norm": 9.9545793157659, "kl": 0.693359375, "learning_rate": 5.386172006745362e-07, "loss": 0.0007, "reward": 3.6783969402313232, "reward_std": 0.2630709856748581, "rewards/final_reward": 1.4409517987452778, "rewards/mask_iou_reward": 0.7204758993726389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6783969402313232, "rewards/thk_ans_format_reward": 1.0, "step": 1368, "think_completion_length": 39.78125 }, { "clip_ratio": 0.0, "completion_length": 107.640625, "epoch": 2.311973018549747, "grad_norm": 6.504769492113551, "kl": 0.65234375, "learning_rate": 5.382799325463744e-07, "loss": 0.0007, "reward": 3.0724406242370605, "reward_std": 0.21577691286802292, "rewards/final_reward": 1.3656826390766597, "rewards/mask_iou_reward": 0.6828413195383298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0724405646324158, "rewards/thk_ans_format_reward": 1.0, "step": 1369, "think_completion_length": 35.375 }, { "clip_ratio": 0.0, "completion_length": 106.0, "epoch": 2.3136593591905563, "grad_norm": 22.322142442582493, "kl": 0.552734375, "learning_rate": 5.379426644182125e-07, "loss": 0.0005, "reward": 3.6724932193756104, "reward_std": 0.031013024039566517, "rewards/final_reward": 1.6675560661103672, "rewards/mask_iou_reward": 0.8337780330551836, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6724932789802551, "rewards/thk_ans_format_reward": 1.0, "step": 1370, "think_completion_length": 36.3125 }, { "clip_ratio": 0.0, "completion_length": 105.171875, "epoch": 2.315345699831366, "grad_norm": 8.858965352477439, "kl": 0.83203125, "learning_rate": 5.376053962900505e-07, "loss": 0.0008, "reward": 3.5324264764785767, "reward_std": 0.1711833318695426, "rewards/final_reward": 1.4827722485784687, "rewards/mask_iou_reward": 0.7413861242892343, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5324264168739319, "rewards/thk_ans_format_reward": 1.0, "step": 1371, "think_completion_length": 32.375 }, { "clip_ratio": 0.0, "completion_length": 102.828125, "epoch": 2.317032040472175, "grad_norm": 7.364348393792841, "kl": 0.5859375, "learning_rate": 5.372681281618887e-07, "loss": 0.0006, "reward": 3.3787490129470825, "reward_std": 0.01813412643969059, "rewards/final_reward": 0.9400155006785603, "rewards/mask_iou_reward": 0.47000775033928016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.378748893737793, "rewards/thk_ans_format_reward": 1.0, "step": 1372, "think_completion_length": 33.40625 }, { "clip_ratio": 0.0, "completion_length": 107.0625, "epoch": 2.318718381112985, "grad_norm": 12.402060150068934, "kl": 0.5859375, "learning_rate": 5.369308600337268e-07, "loss": 0.0006, "reward": 3.37683367729187, "reward_std": 0.23856448754668236, "rewards/final_reward": 1.6650647820314266, "rewards/mask_iou_reward": 0.8325323910157133, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3768335580825806, "rewards/thk_ans_format_reward": 1.0, "step": 1373, "think_completion_length": 35.5 }, { "clip_ratio": 0.0, "completion_length": 116.46875, "epoch": 2.3204047217537944, "grad_norm": 8.192072648544633, "kl": 0.578125, "learning_rate": 5.365935919055648e-07, "loss": 0.0006, "reward": 2.7094311714172363, "reward_std": 0.2052406631410122, "rewards/final_reward": 0.04370178499354055, "rewards/mask_iou_reward": 0.021850892496770274, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7094311714172363, "rewards/thk_ans_format_reward": 1.0, "step": 1374, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 109.875, "epoch": 2.3220910623946036, "grad_norm": 30.616425561825096, "kl": 0.59375, "learning_rate": 5.36256323777403e-07, "loss": 0.0006, "reward": 3.477543830871582, "reward_std": 0.17288058251142502, "rewards/final_reward": 1.4923323475422399, "rewards/mask_iou_reward": 0.7461661737711199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4775438904762268, "rewards/thk_ans_format_reward": 1.0, "step": 1375, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 106.359375, "epoch": 2.3237774030354132, "grad_norm": 9.024795177986041, "kl": 0.580078125, "learning_rate": 5.359190556492411e-07, "loss": 0.0006, "reward": 3.503230929374695, "reward_std": 0.1244723740965128, "rewards/final_reward": 1.5993782177815818, "rewards/mask_iou_reward": 0.7996891088907909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5032309293746948, "rewards/thk_ans_format_reward": 1.0, "step": 1376, "think_completion_length": 38.375 }, { "clip_ratio": 0.0, "completion_length": 103.140625, "epoch": 2.3254637436762224, "grad_norm": 6.357890399165426, "kl": 0.533203125, "learning_rate": 5.355817875210792e-07, "loss": 0.0005, "reward": 3.3854854106903076, "reward_std": 0.2610369510948658, "rewards/final_reward": 1.3946295125890742, "rewards/mask_iou_reward": 0.6973147562945371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3854854702949524, "rewards/thk_ans_format_reward": 1.0, "step": 1377, "think_completion_length": 35.03125 }, { "clip_ratio": 0.0, "completion_length": 106.53125, "epoch": 2.327150084317032, "grad_norm": 11.59512324924954, "kl": 0.54296875, "learning_rate": 5.352445193929174e-07, "loss": 0.0005, "reward": 3.6056227684020996, "reward_std": 0.031252100598067045, "rewards/final_reward": 1.8470167898649712, "rewards/mask_iou_reward": 0.9235083949324856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6056227684020996, "rewards/thk_ans_format_reward": 1.0, "step": 1378, "think_completion_length": 32.09375 }, { "clip_ratio": 0.0, "completion_length": 106.359375, "epoch": 2.3288364249578413, "grad_norm": 17.104449464900775, "kl": 0.52734375, "learning_rate": 5.349072512647554e-07, "loss": 0.0005, "reward": 3.4591941833496094, "reward_std": 0.3491174578666687, "rewards/final_reward": 1.3752283871348885, "rewards/mask_iou_reward": 0.6876141935674442, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4591941833496094, "rewards/thk_ans_format_reward": 1.0, "step": 1379, "think_completion_length": 37.03125 }, { "clip_ratio": 0.0, "completion_length": 107.15625, "epoch": 2.330522765598651, "grad_norm": 15.730667505763513, "kl": 0.5205078125, "learning_rate": 5.345699831365935e-07, "loss": 0.0005, "reward": 3.54481840133667, "reward_std": 0.047257980331778526, "rewards/final_reward": 1.5888374056095371, "rewards/mask_iou_reward": 0.7944187028047686, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5448181629180908, "rewards/thk_ans_format_reward": 1.0, "step": 1380, "think_completion_length": 35.3125 }, { "clip_ratio": 0.0, "completion_length": 106.9375, "epoch": 2.3322091062394605, "grad_norm": 7.4373469061295445, "kl": 0.65625, "learning_rate": 5.342327150084317e-07, "loss": 0.0007, "reward": 2.963295817375183, "reward_std": 0.29863504134118557, "rewards/final_reward": 1.3623242165591476, "rewards/mask_iou_reward": 0.6811621082795738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9632959961891174, "rewards/thk_ans_format_reward": 1.0, "step": 1381, "think_completion_length": 33.40625 }, { "clip_ratio": 0.0, "completion_length": 121.125, "epoch": 2.3338954468802697, "grad_norm": 7.16589243158659, "kl": 0.56640625, "learning_rate": 5.338954468802698e-07, "loss": 0.0006, "reward": 2.9298386573791504, "reward_std": 0.012405174784362316, "rewards/final_reward": 0.9121590290456439, "rewards/mask_iou_reward": 0.45607951452282197, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9298387765884399, "rewards/thk_ans_format_reward": 1.0, "step": 1382, "think_completion_length": 35.625 }, { "clip_ratio": 0.0, "completion_length": 103.53125, "epoch": 2.3355817875210794, "grad_norm": 10.52957536273546, "kl": 0.625, "learning_rate": 5.33558178752108e-07, "loss": 0.0006, "reward": 3.5053229331970215, "reward_std": 0.2050390988588333, "rewards/final_reward": 1.8158258342784386, "rewards/mask_iou_reward": 0.9079129171392193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5053229928016663, "rewards/thk_ans_format_reward": 1.0, "step": 1383, "think_completion_length": 33.28125 }, { "clip_ratio": 0.0, "completion_length": 107.5, "epoch": 2.3372681281618886, "grad_norm": 6.731365865114964, "kl": 0.560546875, "learning_rate": 5.33220910623946e-07, "loss": 0.0006, "reward": 2.6403086185455322, "reward_std": 0.09763676300644875, "rewards/final_reward": 0.785273539529477, "rewards/mask_iou_reward": 0.3926367697647385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6403087079524994, "rewards/thk_ans_format_reward": 1.0, "step": 1384, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 109.65625, "epoch": 2.338954468802698, "grad_norm": 6.818050851838744, "kl": 0.5400390625, "learning_rate": 5.328836424957841e-07, "loss": 0.0005, "reward": 3.666144609451294, "reward_std": 0.0915786512196064, "rewards/final_reward": 1.7479222751329424, "rewards/mask_iou_reward": 0.8739611375664712, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6661444902420044, "rewards/thk_ans_format_reward": 1.0, "step": 1385, "think_completion_length": 36.34375 }, { "clip_ratio": 0.0, "completion_length": 109.453125, "epoch": 2.3406408094435074, "grad_norm": 4.538761263469013, "kl": 0.564453125, "learning_rate": 5.325463743676223e-07, "loss": 0.0006, "reward": 3.2571221590042114, "reward_std": 0.03687620488926768, "rewards/final_reward": 0.650422243607985, "rewards/mask_iou_reward": 0.3252111218039925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2571222186088562, "rewards/thk_ans_format_reward": 1.0, "step": 1386, "think_completion_length": 38.34375 }, { "clip_ratio": 0.0, "completion_length": 99.21875, "epoch": 2.342327150084317, "grad_norm": 14.347621308169337, "kl": 0.509765625, "learning_rate": 5.322091062394604e-07, "loss": 0.0005, "reward": 3.35020649433136, "reward_std": 0.33967210724949837, "rewards/final_reward": 1.5729735712665027, "rewards/mask_iou_reward": 0.7864867856332514, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3502064943313599, "rewards/thk_ans_format_reward": 1.0, "step": 1387, "think_completion_length": 40.3125 }, { "clip_ratio": 0.0, "completion_length": 119.34375, "epoch": 2.3440134907251267, "grad_norm": 15.164151268757067, "kl": 0.51953125, "learning_rate": 5.318718381112984e-07, "loss": 0.0005, "reward": 3.322817802429199, "reward_std": 0.27229122817516327, "rewards/final_reward": 1.2874370263274848, "rewards/mask_iou_reward": 0.6437185131637424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3228177428245544, "rewards/thk_ans_format_reward": 1.0, "step": 1388, "think_completion_length": 38.75 }, { "clip_ratio": 0.0, "completion_length": 108.9375, "epoch": 2.345699831365936, "grad_norm": 11.567337138167554, "kl": 0.576171875, "learning_rate": 5.315345699831366e-07, "loss": 0.0006, "reward": 3.3506091833114624, "reward_std": 0.03909984044730663, "rewards/final_reward": 0.9120567229378608, "rewards/mask_iou_reward": 0.4560283614689304, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3506091237068176, "rewards/thk_ans_format_reward": 1.0, "step": 1389, "think_completion_length": 39.71875 }, { "clip_ratio": 0.0, "completion_length": 102.0625, "epoch": 2.3473861720067455, "grad_norm": 27.705185381349548, "kl": 0.59765625, "learning_rate": 5.311973018549747e-07, "loss": 0.0006, "reward": 3.444739580154419, "reward_std": 0.282623004168272, "rewards/final_reward": 0.9685749597290699, "rewards/mask_iou_reward": 0.48428747986453496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4447395205497742, "rewards/thk_ans_format_reward": 1.0, "step": 1390, "think_completion_length": 34.15625 }, { "clip_ratio": 0.0, "completion_length": 108.078125, "epoch": 2.3490725126475547, "grad_norm": 10.792504700501967, "kl": 0.4921875, "learning_rate": 5.308600337268128e-07, "loss": 0.0005, "reward": 2.987342119216919, "reward_std": 0.561201810836792, "rewards/final_reward": 1.012290295770121, "rewards/mask_iou_reward": 0.5061451478850605, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9873422086238861, "rewards/thk_ans_format_reward": 1.0, "step": 1391, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 106.421875, "epoch": 2.3507588532883643, "grad_norm": 7.9102889207307685, "kl": 0.546875, "learning_rate": 5.30522765598651e-07, "loss": 0.0006, "reward": 3.211172580718994, "reward_std": 0.06296231271699071, "rewards/final_reward": 1.4353681768739848, "rewards/mask_iou_reward": 0.7176840884369924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2111724019050598, "rewards/thk_ans_format_reward": 1.0, "step": 1392, "think_completion_length": 37.625 }, { "clip_ratio": 0.0, "completion_length": 145.5, "epoch": 2.3524451939291735, "grad_norm": 8.891253140343897, "kl": 0.5625, "learning_rate": 5.30185497470489e-07, "loss": 0.0006, "reward": 3.3555647134780884, "reward_std": 0.22218644618988037, "rewards/final_reward": 1.2452854457618507, "rewards/mask_iou_reward": 0.6226427228809254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3555646538734436, "rewards/thk_ans_format_reward": 1.0, "step": 1393, "think_completion_length": 35.25 }, { "clip_ratio": 0.0, "completion_length": 105.609375, "epoch": 2.354131534569983, "grad_norm": 11.443989159249242, "kl": 0.54296875, "learning_rate": 5.298482293423271e-07, "loss": 0.0005, "reward": 3.45237934589386, "reward_std": 0.16156933456659317, "rewards/final_reward": 1.792342337405704, "rewards/mask_iou_reward": 0.896171168702852, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.452379286289215, "rewards/thk_ans_format_reward": 1.0, "step": 1394, "think_completion_length": 36.75 }, { "clip_ratio": 0.0, "completion_length": 109.515625, "epoch": 2.3558178752107928, "grad_norm": 8.3765850159821, "kl": 0.67578125, "learning_rate": 5.295109612141653e-07, "loss": 0.0007, "reward": 3.2284774780273438, "reward_std": 0.22332235658541322, "rewards/final_reward": 1.5533457099388035, "rewards/mask_iou_reward": 0.7766728549694017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2284774482250214, "rewards/thk_ans_format_reward": 1.0, "step": 1395, "think_completion_length": 39.0 }, { "clip_ratio": 0.0, "completion_length": 109.015625, "epoch": 2.357504215851602, "grad_norm": 17.58200086525832, "kl": 0.54296875, "learning_rate": 5.291736930860033e-07, "loss": 0.0005, "reward": 3.1018285751342773, "reward_std": 0.35995006561279297, "rewards/final_reward": 1.0164651908095734, "rewards/mask_iou_reward": 0.5082325954047867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1018284559249878, "rewards/thk_ans_format_reward": 1.0, "step": 1396, "think_completion_length": 37.03125 }, { "clip_ratio": 0.0, "completion_length": 117.078125, "epoch": 2.3591905564924116, "grad_norm": 22.290024986668044, "kl": 0.52734375, "learning_rate": 5.288364249578414e-07, "loss": 0.0005, "reward": 2.9656145572662354, "reward_std": 0.2921905219554901, "rewards/final_reward": 0.6604535228611185, "rewards/mask_iou_reward": 0.33022676143055923, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9656146168708801, "rewards/thk_ans_format_reward": 1.0, "step": 1397, "think_completion_length": 35.4375 }, { "clip_ratio": 0.0, "completion_length": 106.53125, "epoch": 2.360876897133221, "grad_norm": 3.9345173173490138, "kl": 0.541015625, "learning_rate": 5.284991568296796e-07, "loss": 0.0006, "reward": 3.059117913246155, "reward_std": 0.005253995528619271, "rewards/final_reward": 1.965131045395031, "rewards/mask_iou_reward": 0.9825655226975155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0591178834438324, "rewards/thk_ans_format_reward": 1.0, "step": 1398, "think_completion_length": 37.0625 }, { "clip_ratio": 0.0, "completion_length": 95.875, "epoch": 2.3625632377740304, "grad_norm": 7.9093384724318865, "kl": 0.568359375, "learning_rate": 5.281618887015177e-07, "loss": 0.0006, "reward": 3.475532650947571, "reward_std": 0.1894139125943184, "rewards/final_reward": 1.1372858494947948, "rewards/mask_iou_reward": 0.5686429247473974, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4755328297615051, "rewards/thk_ans_format_reward": 1.0, "step": 1399, "think_completion_length": 39.46875 }, { "clip_ratio": 0.0, "completion_length": 106.484375, "epoch": 2.3642495784148396, "grad_norm": 6.224619860019826, "kl": 0.564453125, "learning_rate": 5.278246205733558e-07, "loss": 0.0006, "reward": 2.9742729663848877, "reward_std": 0.09473420679569244, "rewards/final_reward": 0.23003473812021966, "rewards/mask_iou_reward": 0.11501736906010983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9742728471755981, "rewards/thk_ans_format_reward": 1.0, "step": 1400, "think_completion_length": 35.1875 }, { "clip_ratio": 0.0, "completion_length": 116.46875, "epoch": 2.3659359190556493, "grad_norm": 10.832806018074743, "kl": 0.50390625, "learning_rate": 5.27487352445194e-07, "loss": 0.0005, "reward": 3.378462791442871, "reward_std": 0.21569720469415188, "rewards/final_reward": 1.5557822843293696, "rewards/mask_iou_reward": 0.7778911421646848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.378462791442871, "rewards/thk_ans_format_reward": 1.0, "step": 1401, "think_completion_length": 38.25 }, { "clip_ratio": 0.0, "completion_length": 121.046875, "epoch": 2.367622259696459, "grad_norm": 7.501941757157272, "kl": 0.5859375, "learning_rate": 5.27150084317032e-07, "loss": 0.0006, "reward": 3.699384927749634, "reward_std": 0.06814133375883102, "rewards/final_reward": 1.5414973015209585, "rewards/mask_iou_reward": 0.7707486507604793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6993848085403442, "rewards/thk_ans_format_reward": 1.0, "step": 1402, "think_completion_length": 37.84375 }, { "clip_ratio": 0.0, "completion_length": 106.578125, "epoch": 2.369308600337268, "grad_norm": 7.5412459195508035, "kl": 0.572265625, "learning_rate": 5.268128161888701e-07, "loss": 0.0006, "reward": 3.515091300010681, "reward_std": 0.18088901042938232, "rewards/final_reward": 1.3727902363585298, "rewards/mask_iou_reward": 0.6863951181792649, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5150913000106812, "rewards/thk_ans_format_reward": 1.0, "step": 1403, "think_completion_length": 33.96875 }, { "clip_ratio": 0.0, "completion_length": 108.9375, "epoch": 2.3709949409780777, "grad_norm": 9.174490098877436, "kl": 0.56640625, "learning_rate": 5.264755480607082e-07, "loss": 0.0006, "reward": 2.920655369758606, "reward_std": 0.1659610359929502, "rewards/final_reward": 1.3589814228169435, "rewards/mask_iou_reward": 0.6794907114084717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9206554293632507, "rewards/thk_ans_format_reward": 1.0, "step": 1404, "think_completion_length": 36.34375 }, { "clip_ratio": 0.0, "completion_length": 102.609375, "epoch": 2.372681281618887, "grad_norm": 5.524729092924367, "kl": 0.61328125, "learning_rate": 5.261382799325463e-07, "loss": 0.0006, "reward": 3.408183217048645, "reward_std": 0.11358396708965302, "rewards/final_reward": 1.3904137277971458, "rewards/mask_iou_reward": 0.6952068638985729, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4081830978393555, "rewards/thk_ans_format_reward": 1.0, "step": 1405, "think_completion_length": 33.875 }, { "clip_ratio": 0.0, "completion_length": 107.984375, "epoch": 2.3743676222596966, "grad_norm": 9.897592505644342, "kl": 0.568359375, "learning_rate": 5.258010118043844e-07, "loss": 0.0006, "reward": 3.7648085355758667, "reward_std": 0.1485668420791626, "rewards/final_reward": 1.8613900782667594, "rewards/mask_iou_reward": 0.9306950391333797, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7648085355758667, "rewards/thk_ans_format_reward": 1.0, "step": 1406, "think_completion_length": 38.375 }, { "clip_ratio": 0.0, "completion_length": 108.640625, "epoch": 2.3760539629005057, "grad_norm": 14.760801593326821, "kl": 0.58984375, "learning_rate": 5.254637436762226e-07, "loss": 0.0006, "reward": 3.5451170206069946, "reward_std": 0.046211473643779755, "rewards/final_reward": 1.759196898924725, "rewards/mask_iou_reward": 0.8795984494623625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5451170802116394, "rewards/thk_ans_format_reward": 1.0, "step": 1407, "think_completion_length": 35.5625 }, { "clip_ratio": 0.0, "completion_length": 101.4375, "epoch": 2.3777403035413154, "grad_norm": 6.238961683518768, "kl": 0.7265625, "learning_rate": 5.251264755480607e-07, "loss": 0.0007, "reward": 3.763440251350403, "reward_std": 0.28513549268245697, "rewards/final_reward": 1.729252268258402, "rewards/mask_iou_reward": 0.864626134129201, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7634401321411133, "rewards/thk_ans_format_reward": 1.0, "step": 1408, "think_completion_length": 33.875 }, { "clip_ratio": 0.0, "completion_length": 112.828125, "epoch": 2.379426644182125, "grad_norm": 7.543950870667907, "kl": 0.50390625, "learning_rate": 5.247892074198989e-07, "loss": 0.0005, "reward": 3.1753127574920654, "reward_std": 0.25032037193886936, "rewards/final_reward": 1.5657431095470087, "rewards/mask_iou_reward": 0.7828715547735043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.175312876701355, "rewards/thk_ans_format_reward": 1.0, "step": 1409, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 145.578125, "epoch": 2.381112984822934, "grad_norm": 7.826650646237432, "kl": 0.533203125, "learning_rate": 5.24451939291737e-07, "loss": 0.0005, "reward": 3.2201855182647705, "reward_std": 0.26124662533402443, "rewards/final_reward": 1.5676906010913358, "rewards/mask_iou_reward": 0.7838453005456679, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2201855182647705, "rewards/thk_ans_format_reward": 1.0, "step": 1410, "think_completion_length": 37.25 }, { "clip_ratio": 0.0, "completion_length": 120.265625, "epoch": 2.382799325463744, "grad_norm": 14.172162637197918, "kl": 0.515625, "learning_rate": 5.24114671163575e-07, "loss": 0.0006, "reward": 3.4743032455444336, "reward_std": 0.19433462619781494, "rewards/final_reward": 1.671436299136126, "rewards/mask_iou_reward": 0.835718149568063, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.474303126335144, "rewards/thk_ans_format_reward": 1.0, "step": 1411, "think_completion_length": 36.6875 }, { "clip_ratio": 0.0, "completion_length": 135.75, "epoch": 2.384485666104553, "grad_norm": 13.63952704554102, "kl": 0.7275390625, "learning_rate": 5.237774030354132e-07, "loss": 0.0007, "reward": 3.3273757696151733, "reward_std": 0.11718492582440376, "rewards/final_reward": 1.339164020065489, "rewards/mask_iou_reward": 0.6695820100327445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3273758292198181, "rewards/thk_ans_format_reward": 1.0, "step": 1412, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 111.234375, "epoch": 2.3861720067453627, "grad_norm": 9.02357872456789, "kl": 0.55078125, "learning_rate": 5.234401349072512e-07, "loss": 0.0005, "reward": 3.219269037246704, "reward_std": 0.408921817317605, "rewards/final_reward": 1.103784555988953, "rewards/mask_iou_reward": 0.5518922779944765, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2192689776420593, "rewards/thk_ans_format_reward": 1.0, "step": 1413, "think_completion_length": 39.84375 }, { "clip_ratio": 0.0, "completion_length": 110.4375, "epoch": 2.387858347386172, "grad_norm": 5.269468878977596, "kl": 0.537109375, "learning_rate": 5.231028667790893e-07, "loss": 0.0005, "reward": 3.221733808517456, "reward_std": 0.255710706114769, "rewards/final_reward": 1.6817738791159162, "rewards/mask_iou_reward": 0.8408869395579581, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2217336893081665, "rewards/thk_ans_format_reward": 1.0, "step": 1414, "think_completion_length": 35.75 }, { "clip_ratio": 0.0, "completion_length": 108.65625, "epoch": 2.3895446880269815, "grad_norm": 11.668623025824209, "kl": 0.59375, "learning_rate": 5.227655986509275e-07, "loss": 0.0006, "reward": 2.9161574840545654, "reward_std": 0.1359611563384533, "rewards/final_reward": 0.3541008718656232, "rewards/mask_iou_reward": 0.1770504359328116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9161574840545654, "rewards/thk_ans_format_reward": 1.0, "step": 1415, "think_completion_length": 37.90625 }, { "clip_ratio": 0.0, "completion_length": 110.203125, "epoch": 2.391231028667791, "grad_norm": 6.835618298256512, "kl": 0.55859375, "learning_rate": 5.224283305227656e-07, "loss": 0.0006, "reward": 2.6472376585006714, "reward_std": 0.19794801366515458, "rewards/final_reward": 0.05851637501519018, "rewards/mask_iou_reward": 0.02925818750759509, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6472376435995102, "rewards/thk_ans_format_reward": 1.0, "step": 1416, "think_completion_length": 36.71875 }, { "clip_ratio": 0.0, "completion_length": 111.71875, "epoch": 2.3929173693086003, "grad_norm": 11.735490794711714, "kl": 0.58984375, "learning_rate": 5.220910623946037e-07, "loss": 0.0006, "reward": 3.3064658641815186, "reward_std": 0.09207919798791409, "rewards/final_reward": 1.5212218066720333, "rewards/mask_iou_reward": 0.7606109033360167, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3064658045768738, "rewards/thk_ans_format_reward": 1.0, "step": 1417, "think_completion_length": 43.25 }, { "clip_ratio": 0.0, "completion_length": 106.71875, "epoch": 2.39460370994941, "grad_norm": 7.427700460484598, "kl": 0.689453125, "learning_rate": 5.217537942664419e-07, "loss": 0.0007, "reward": 3.496484875679016, "reward_std": 0.27999068424105644, "rewards/final_reward": 1.4666861887157916, "rewards/mask_iou_reward": 0.7333430943578958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.496484637260437, "rewards/thk_ans_format_reward": 1.0, "step": 1418, "think_completion_length": 35.375 }, { "clip_ratio": 0.0, "completion_length": 107.046875, "epoch": 2.396290050590219, "grad_norm": 6.7558319839546215, "kl": 0.580078125, "learning_rate": 5.214165261382799e-07, "loss": 0.0006, "reward": 3.452078342437744, "reward_std": 0.18629483878612518, "rewards/final_reward": 1.9352354965872987, "rewards/mask_iou_reward": 0.9676177482936493, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4520783424377441, "rewards/thk_ans_format_reward": 1.0, "step": 1419, "think_completion_length": 38.59375 }, { "clip_ratio": 0.0, "completion_length": 111.515625, "epoch": 2.397976391231029, "grad_norm": 19.13059374771945, "kl": 0.55859375, "learning_rate": 5.21079258010118e-07, "loss": 0.0006, "reward": 3.172434687614441, "reward_std": 0.021215507294982672, "rewards/final_reward": 1.0803966303114596, "rewards/mask_iou_reward": 0.5401983151557298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1724347174167633, "rewards/thk_ans_format_reward": 1.0, "step": 1420, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 112.4375, "epoch": 2.399662731871838, "grad_norm": 8.533750845914334, "kl": 0.572265625, "learning_rate": 5.207419898819561e-07, "loss": 0.0006, "reward": 3.4311397075653076, "reward_std": 0.12119658989831805, "rewards/final_reward": 1.1740384616766832, "rewards/mask_iou_reward": 0.5870192308383416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.431139886379242, "rewards/thk_ans_format_reward": 1.0, "step": 1421, "think_completion_length": 44.4375 }, { "clip_ratio": 0.0, "completion_length": 107.609375, "epoch": 2.4013490725126476, "grad_norm": 8.827783645808221, "kl": 0.578125, "learning_rate": 5.204047217537942e-07, "loss": 0.0006, "reward": 3.4205384254455566, "reward_std": 0.13982452638447285, "rewards/final_reward": 1.4854884004504718, "rewards/mask_iou_reward": 0.7427442002252359, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4205384850502014, "rewards/thk_ans_format_reward": 1.0, "step": 1422, "think_completion_length": 38.4375 }, { "clip_ratio": 0.0, "completion_length": 111.328125, "epoch": 2.403035413153457, "grad_norm": 6.05465436982932, "kl": 1.525390625, "learning_rate": 5.200674536256323e-07, "loss": 0.0015, "reward": 3.809617519378662, "reward_std": 0.1390428734011948, "rewards/final_reward": 1.732256359496069, "rewards/mask_iou_reward": 0.8661281797480345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8096173405647278, "rewards/thk_ans_format_reward": 1.0, "step": 1423, "think_completion_length": 41.46875 }, { "clip_ratio": 0.0, "completion_length": 115.90625, "epoch": 2.4047217537942664, "grad_norm": 7.545060038565714, "kl": 0.529296875, "learning_rate": 5.197301854974705e-07, "loss": 0.0005, "reward": 3.338720679283142, "reward_std": 0.09901190176606178, "rewards/final_reward": 1.2441309370313292, "rewards/mask_iou_reward": 0.6220654685156646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3387206196784973, "rewards/thk_ans_format_reward": 1.0, "step": 1424, "think_completion_length": 38.03125 }, { "clip_ratio": 0.0, "completion_length": 104.390625, "epoch": 2.4064080944350756, "grad_norm": 8.125363416747533, "kl": 0.6171875, "learning_rate": 5.193929173693086e-07, "loss": 0.0006, "reward": 3.300279378890991, "reward_std": 0.07601998746395111, "rewards/final_reward": 1.452418104648395, "rewards/mask_iou_reward": 0.7262090523241975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3002793192863464, "rewards/thk_ans_format_reward": 1.0, "step": 1425, "think_completion_length": 34.0625 }, { "clip_ratio": 0.0, "completion_length": 107.6875, "epoch": 2.4080944350758853, "grad_norm": 14.793698598847602, "kl": 0.552734375, "learning_rate": 5.190556492411467e-07, "loss": 0.0006, "reward": 3.349576950073242, "reward_std": 0.06855934672057629, "rewards/final_reward": 1.3882612799502339, "rewards/mask_iou_reward": 0.6941306399751169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3495770692825317, "rewards/thk_ans_format_reward": 1.0, "step": 1426, "think_completion_length": 36.1875 }, { "clip_ratio": 0.0, "completion_length": 107.328125, "epoch": 2.409780775716695, "grad_norm": 22.392515052220187, "kl": 0.63671875, "learning_rate": 5.187183811129849e-07, "loss": 0.0006, "reward": 3.1435389518737793, "reward_std": 0.22166889160871506, "rewards/final_reward": 0.7710225803234687, "rewards/mask_iou_reward": 0.38551129016173435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.143539011478424, "rewards/thk_ans_format_reward": 1.0, "step": 1427, "think_completion_length": 41.59375 }, { "clip_ratio": 0.0, "completion_length": 114.84375, "epoch": 2.411467116357504, "grad_norm": 5.926089505112166, "kl": 0.515625, "learning_rate": 5.183811129848229e-07, "loss": 0.0005, "reward": 3.498886823654175, "reward_std": 0.2120041623711586, "rewards/final_reward": 1.5323652788101025, "rewards/mask_iou_reward": 0.7661826394050513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4988868236541748, "rewards/thk_ans_format_reward": 1.0, "step": 1428, "think_completion_length": 41.0 }, { "clip_ratio": 0.0, "completion_length": 111.234375, "epoch": 2.4131534569983137, "grad_norm": 6.680627025015385, "kl": 0.5703125, "learning_rate": 5.180438448566609e-07, "loss": 0.0006, "reward": 3.243508219718933, "reward_std": 0.17464113235473633, "rewards/final_reward": 1.0407643744754143, "rewards/mask_iou_reward": 0.5203821872377071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.243508219718933, "rewards/thk_ans_format_reward": 1.0, "step": 1429, "think_completion_length": 40.9375 }, { "clip_ratio": 0.0, "completion_length": 110.921875, "epoch": 2.414839797639123, "grad_norm": 8.701499831433182, "kl": 0.568359375, "learning_rate": 5.177065767284991e-07, "loss": 0.0006, "reward": 2.999347448348999, "reward_std": 0.16424234956502914, "rewards/final_reward": 1.4692365949521713, "rewards/mask_iou_reward": 0.7346182974760856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9993474781513214, "rewards/thk_ans_format_reward": 1.0, "step": 1430, "think_completion_length": 38.46875 }, { "clip_ratio": 0.0, "completion_length": 137.796875, "epoch": 2.4165261382799326, "grad_norm": 8.019057025347687, "kl": 0.5146484375, "learning_rate": 5.173693086003372e-07, "loss": 0.0005, "reward": 3.8532402515411377, "reward_std": 0.02663713227957487, "rewards/final_reward": 1.9138304988010595, "rewards/mask_iou_reward": 0.9569152494005297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8532403707504272, "rewards/thk_ans_format_reward": 1.0, "step": 1431, "think_completion_length": 36.03125 }, { "clip_ratio": 0.0, "completion_length": 104.5, "epoch": 2.4182124789207418, "grad_norm": 10.521478360313901, "kl": 0.69921875, "learning_rate": 5.170320404721753e-07, "loss": 0.0007, "reward": 3.5484408140182495, "reward_std": 0.04367404989898205, "rewards/final_reward": 1.6665183138121653, "rewards/mask_iou_reward": 0.8332591569060827, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.54844069480896, "rewards/thk_ans_format_reward": 1.0, "step": 1432, "think_completion_length": 35.15625 }, { "clip_ratio": 0.0, "completion_length": 139.40625, "epoch": 2.4198988195615514, "grad_norm": 11.393237382787158, "kl": 0.5859375, "learning_rate": 5.166947723440135e-07, "loss": 0.0006, "reward": 3.6145013570785522, "reward_std": 0.08510691672563553, "rewards/final_reward": 1.6453253678397484, "rewards/mask_iou_reward": 0.8226626839198742, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6145014762878418, "rewards/thk_ans_format_reward": 1.0, "step": 1433, "think_completion_length": 37.3125 }, { "clip_ratio": 0.0, "completion_length": 112.71875, "epoch": 2.421585160202361, "grad_norm": 12.955716447139663, "kl": 0.517578125, "learning_rate": 5.163575042158516e-07, "loss": 0.0005, "reward": 3.325462579727173, "reward_std": 0.31851503252983093, "rewards/final_reward": 1.244566120650251, "rewards/mask_iou_reward": 0.6222830603251255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.325462520122528, "rewards/thk_ans_format_reward": 1.0, "step": 1434, "think_completion_length": 39.5 }, { "clip_ratio": 0.0, "completion_length": 107.796875, "epoch": 2.4232715008431702, "grad_norm": 26.17046760870296, "kl": 0.58984375, "learning_rate": 5.160202360876898e-07, "loss": 0.0006, "reward": 3.147235631942749, "reward_std": 0.26723285019397736, "rewards/final_reward": 1.2566699240865904, "rewards/mask_iou_reward": 0.6283349620432952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1472358107566833, "rewards/thk_ans_format_reward": 1.0, "step": 1435, "think_completion_length": 37.75 }, { "clip_ratio": 0.0, "completion_length": 105.765625, "epoch": 2.42495784148398, "grad_norm": 8.568795937105243, "kl": 0.615234375, "learning_rate": 5.156829679595279e-07, "loss": 0.0006, "reward": 3.0845930576324463, "reward_std": 0.33267538249492645, "rewards/final_reward": 1.0768244291067677, "rewards/mask_iou_reward": 0.5384122145533838, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1158430576324463, "rewards/thk_ans_format_reward": 0.984375, "step": 1436, "think_completion_length": 37.4375 }, { "clip_ratio": 0.0, "completion_length": 109.015625, "epoch": 2.426644182124789, "grad_norm": 86.30501173717676, "kl": 0.556640625, "learning_rate": 5.153456998313658e-07, "loss": 0.0005, "reward": 3.022960066795349, "reward_std": 0.30731740966439247, "rewards/final_reward": 1.3756990503386066, "rewards/mask_iou_reward": 0.6878495251693033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0229600071907043, "rewards/thk_ans_format_reward": 1.0, "step": 1437, "think_completion_length": 39.03125 }, { "clip_ratio": 0.0, "completion_length": 107.09375, "epoch": 2.4283305227655987, "grad_norm": 19.65157587762331, "kl": 0.6171875, "learning_rate": 5.15008431703204e-07, "loss": 0.0006, "reward": 3.5178322792053223, "reward_std": 0.22085876762866974, "rewards/final_reward": 1.7174181471389547, "rewards/mask_iou_reward": 0.8587090735694773, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.517832338809967, "rewards/thk_ans_format_reward": 1.0, "step": 1438, "think_completion_length": 38.21875 }, { "clip_ratio": 0.0, "completion_length": 108.375, "epoch": 2.430016863406408, "grad_norm": 7.63639577539678, "kl": 0.546875, "learning_rate": 5.146711635750421e-07, "loss": 0.0005, "reward": 3.086588501930237, "reward_std": 0.10986323654651642, "rewards/final_reward": 1.012658648915526, "rewards/mask_iou_reward": 0.506329324457763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0865885615348816, "rewards/thk_ans_format_reward": 1.0, "step": 1439, "think_completion_length": 39.4375 }, { "clip_ratio": 0.0, "completion_length": 103.328125, "epoch": 2.4317032040472175, "grad_norm": 8.580362862361007, "kl": 0.615234375, "learning_rate": 5.143338954468802e-07, "loss": 0.0006, "reward": 3.609292507171631, "reward_std": 0.2019364982843399, "rewards/final_reward": 1.5419548230177922, "rewards/mask_iou_reward": 0.7709774115088961, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.609292447566986, "rewards/thk_ans_format_reward": 1.0, "step": 1440, "think_completion_length": 34.09375 }, { "clip_ratio": 0.0, "completion_length": 116.578125, "epoch": 2.433389544688027, "grad_norm": 6.399228270220903, "kl": 0.626953125, "learning_rate": 5.139966273187184e-07, "loss": 0.0006, "reward": 3.74202823638916, "reward_std": 0.01405814103782177, "rewards/final_reward": 1.6983815387364867, "rewards/mask_iou_reward": 0.8491907693682433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7420281767845154, "rewards/thk_ans_format_reward": 1.0, "step": 1441, "think_completion_length": 37.5625 }, { "clip_ratio": 0.0, "completion_length": 106.890625, "epoch": 2.4350758853288363, "grad_norm": 7.7051284301611584, "kl": 0.68359375, "learning_rate": 5.136593591905565e-07, "loss": 0.0007, "reward": 3.153511881828308, "reward_std": 0.1515724379569292, "rewards/final_reward": 1.4864379555430862, "rewards/mask_iou_reward": 0.7432189777715431, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1535118222236633, "rewards/thk_ans_format_reward": 1.0, "step": 1442, "think_completion_length": 37.125 }, { "clip_ratio": 0.0, "completion_length": 106.234375, "epoch": 2.436762225969646, "grad_norm": 5.5674408687940335, "kl": 0.5859375, "learning_rate": 5.133220910623946e-07, "loss": 0.0006, "reward": 3.341616630554199, "reward_std": 0.328810915350914, "rewards/final_reward": 1.717067249798786, "rewards/mask_iou_reward": 0.858533624899393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3416166305541992, "rewards/thk_ans_format_reward": 1.0, "step": 1443, "think_completion_length": 35.0625 }, { "clip_ratio": 0.0, "completion_length": 121.265625, "epoch": 2.438448566610455, "grad_norm": 6.598616436762747, "kl": 0.529296875, "learning_rate": 5.129848229342328e-07, "loss": 0.0006, "reward": 3.5519330501556396, "reward_std": 0.10587704600766301, "rewards/final_reward": 1.7283281092139209, "rewards/mask_iou_reward": 0.8641640546069604, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55193293094635, "rewards/thk_ans_format_reward": 1.0, "step": 1444, "think_completion_length": 37.71875 }, { "clip_ratio": 0.0, "completion_length": 119.21875, "epoch": 2.440134907251265, "grad_norm": 9.55776911413741, "kl": 0.552734375, "learning_rate": 5.126475548060709e-07, "loss": 0.0006, "reward": 3.2191598415374756, "reward_std": 0.12895439565181732, "rewards/final_reward": 1.3709724588566616, "rewards/mask_iou_reward": 0.6854862294283308, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2191597819328308, "rewards/thk_ans_format_reward": 1.0, "step": 1445, "think_completion_length": 35.09375 }, { "clip_ratio": 0.0, "completion_length": 108.609375, "epoch": 2.441821247892074, "grad_norm": 7.083086725950559, "kl": 0.65625, "learning_rate": 5.123102866779088e-07, "loss": 0.0007, "reward": 3.472890257835388, "reward_std": 0.09058744460344315, "rewards/final_reward": 1.7449738984243224, "rewards/mask_iou_reward": 0.8724869492121612, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4728901982307434, "rewards/thk_ans_format_reward": 1.0, "step": 1446, "think_completion_length": 37.5625 }, { "clip_ratio": 0.0, "completion_length": 107.125, "epoch": 2.4435075885328836, "grad_norm": 6.919845641855438, "kl": 0.615234375, "learning_rate": 5.11973018549747e-07, "loss": 0.0006, "reward": 3.4161049127578735, "reward_std": 0.2600807845592499, "rewards/final_reward": 1.4440276593637122, "rewards/mask_iou_reward": 0.7220138296818561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4161049723625183, "rewards/thk_ans_format_reward": 1.0, "step": 1447, "think_completion_length": 36.625 }, { "clip_ratio": 0.0, "completion_length": 109.078125, "epoch": 2.4451939291736933, "grad_norm": 11.199767302410693, "kl": 0.58984375, "learning_rate": 5.116357504215851e-07, "loss": 0.0006, "reward": 3.305394768714905, "reward_std": 0.12580876052379608, "rewards/final_reward": 0.9565860016690916, "rewards/mask_iou_reward": 0.4782930008345458, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3053947687149048, "rewards/thk_ans_format_reward": 1.0, "step": 1448, "think_completion_length": 36.75 }, { "clip_ratio": 0.0, "completion_length": 116.828125, "epoch": 2.4468802698145025, "grad_norm": 7.822717881138574, "kl": 0.55078125, "learning_rate": 5.112984822934232e-07, "loss": 0.0006, "reward": 3.0629160404205322, "reward_std": 0.16106662526726723, "rewards/final_reward": 1.2339833151801627, "rewards/mask_iou_reward": 0.6169916575900813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.062916100025177, "rewards/thk_ans_format_reward": 1.0, "step": 1449, "think_completion_length": 37.15625 }, { "clip_ratio": 0.0, "completion_length": 122.28125, "epoch": 2.448566610455312, "grad_norm": 7.477339217559345, "kl": 0.5234375, "learning_rate": 5.109612141652614e-07, "loss": 0.0005, "reward": 3.340023159980774, "reward_std": 0.13386711478233337, "rewards/final_reward": 1.5786122566293084, "rewards/mask_iou_reward": 0.7893061283146542, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3400230407714844, "rewards/thk_ans_format_reward": 1.0, "step": 1450, "think_completion_length": 44.5625 }, { "clip_ratio": 0.0, "completion_length": 119.640625, "epoch": 2.4502529510961213, "grad_norm": 20.58704918234177, "kl": 0.5234375, "learning_rate": 5.106239460370995e-07, "loss": 0.0005, "reward": 3.4118305444717407, "reward_std": 0.33906523138284683, "rewards/final_reward": 1.5533910212416537, "rewards/mask_iou_reward": 0.7766955106208269, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4118306040763855, "rewards/thk_ans_format_reward": 1.0, "step": 1451, "think_completion_length": 34.96875 }, { "clip_ratio": 0.0, "completion_length": 130.28125, "epoch": 2.451939291736931, "grad_norm": 7.301675768677225, "kl": 0.8359375, "learning_rate": 5.102866779089376e-07, "loss": 0.0008, "reward": 3.0271406173706055, "reward_std": 0.18788279592990875, "rewards/final_reward": 0.800656352545829, "rewards/mask_iou_reward": 0.4003281762729145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0271407067775726, "rewards/thk_ans_format_reward": 1.0, "step": 1452, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 111.84375, "epoch": 2.45362563237774, "grad_norm": 7.854013631225384, "kl": 0.55078125, "learning_rate": 5.099494097807758e-07, "loss": 0.0005, "reward": 3.3608494997024536, "reward_std": 0.11720556672662497, "rewards/final_reward": 1.8089525754976725, "rewards/mask_iou_reward": 0.9044762877488363, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3608494997024536, "rewards/thk_ans_format_reward": 1.0, "step": 1453, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 111.5625, "epoch": 2.4553119730185498, "grad_norm": 10.7742730214391, "kl": 0.578125, "learning_rate": 5.096121416526137e-07, "loss": 0.0006, "reward": 3.0832302570343018, "reward_std": 0.21278557181358337, "rewards/final_reward": 1.304668780640139, "rewards/mask_iou_reward": 0.6523343903200695, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.083230197429657, "rewards/thk_ans_format_reward": 1.0, "step": 1454, "think_completion_length": 35.65625 }, { "clip_ratio": 0.0, "completion_length": 119.84375, "epoch": 2.4569983136593594, "grad_norm": 9.536636881138167, "kl": 0.55859375, "learning_rate": 5.092748735244518e-07, "loss": 0.0006, "reward": 3.2125355005264282, "reward_std": 0.08188419789075851, "rewards/final_reward": 1.5690705940701015, "rewards/mask_iou_reward": 0.7845352970350508, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.212535560131073, "rewards/thk_ans_format_reward": 1.0, "step": 1455, "think_completion_length": 38.25 }, { "clip_ratio": 0.0, "completion_length": 136.703125, "epoch": 2.4586846543001686, "grad_norm": 9.80674586850884, "kl": 0.556640625, "learning_rate": 5.0893760539629e-07, "loss": 0.0006, "reward": 3.300530433654785, "reward_std": 0.08604636648669839, "rewards/final_reward": 1.023525441031889, "rewards/mask_iou_reward": 0.5117627205159445, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3005304336547852, "rewards/thk_ans_format_reward": 1.0, "step": 1456, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 113.5, "epoch": 2.460370994940978, "grad_norm": 5.343467075152932, "kl": 0.6171875, "learning_rate": 5.086003372681281e-07, "loss": 0.0006, "reward": 3.5373661518096924, "reward_std": 0.1607318501919508, "rewards/final_reward": 1.1570778227331253, "rewards/mask_iou_reward": 0.5785389113665627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5373662114143372, "rewards/thk_ans_format_reward": 1.0, "step": 1457, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 138.53125, "epoch": 2.4620573355817874, "grad_norm": 5.496156238647655, "kl": 0.546875, "learning_rate": 5.082630691399663e-07, "loss": 0.0005, "reward": 3.036491870880127, "reward_std": 0.05605571623891592, "rewards/final_reward": 1.1909960638484658, "rewards/mask_iou_reward": 0.5954980319242329, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.036491721868515, "rewards/thk_ans_format_reward": 1.0, "step": 1458, "think_completion_length": 38.5 }, { "clip_ratio": 0.0, "completion_length": 108.265625, "epoch": 2.463743676222597, "grad_norm": 4.796046963022677, "kl": 0.666015625, "learning_rate": 5.079258010118044e-07, "loss": 0.0007, "reward": 3.0368528366088867, "reward_std": 0.05668491870164871, "rewards/final_reward": 1.3625609609011828, "rewards/mask_iou_reward": 0.6812804804505914, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.036852777004242, "rewards/thk_ans_format_reward": 1.0, "step": 1459, "think_completion_length": 38.96875 }, { "clip_ratio": 0.0, "completion_length": 170.734375, "epoch": 2.4654300168634062, "grad_norm": 18.43196278631787, "kl": 0.513671875, "learning_rate": 5.075885328836425e-07, "loss": 0.0005, "reward": 3.3323365449905396, "reward_std": 0.29067130386829376, "rewards/final_reward": 1.0533541995261038, "rewards/mask_iou_reward": 0.5266770997630519, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.34796142578125, "rewards/thk_ans_format_reward": 1.0, "step": 1460, "think_completion_length": 37.28125 }, { "clip_ratio": 0.0, "completion_length": 107.375, "epoch": 2.467116357504216, "grad_norm": 27.616315775749797, "kl": 0.705078125, "learning_rate": 5.072512647554807e-07, "loss": 0.0007, "reward": 3.1079777479171753, "reward_std": 0.08986812457442284, "rewards/final_reward": 0.5474214879453962, "rewards/mask_iou_reward": 0.2737107439726981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.107977718114853, "rewards/thk_ans_format_reward": 1.0, "step": 1461, "think_completion_length": 36.0 }, { "clip_ratio": 0.0, "completion_length": 154.953125, "epoch": 2.4688026981450255, "grad_norm": 9.595931182246488, "kl": 0.52734375, "learning_rate": 5.069139966273187e-07, "loss": 0.0005, "reward": 3.1558161973953247, "reward_std": 0.1381340161897242, "rewards/final_reward": 1.4976028322847301, "rewards/mask_iou_reward": 0.7488014161423651, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1558160781860352, "rewards/thk_ans_format_reward": 1.0, "step": 1462, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 129.5, "epoch": 2.4704890387858347, "grad_norm": 10.297934092004121, "kl": 0.5390625, "learning_rate": 5.065767284991567e-07, "loss": 0.0005, "reward": 2.827468156814575, "reward_std": 0.3423341289162636, "rewards/final_reward": 0.9297495915607341, "rewards/mask_iou_reward": 0.46487479578036706, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.827468067407608, "rewards/thk_ans_format_reward": 1.0, "step": 1463, "think_completion_length": 38.21875 }, { "clip_ratio": 0.0, "completion_length": 112.40625, "epoch": 2.4721753794266443, "grad_norm": 13.431336757603988, "kl": 0.5859375, "learning_rate": 5.062394603709949e-07, "loss": 0.0006, "reward": 3.1791683435440063, "reward_std": 0.21104427706450224, "rewards/final_reward": 1.347296892498644, "rewards/mask_iou_reward": 0.673648446249322, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1791683435440063, "rewards/thk_ans_format_reward": 1.0, "step": 1464, "think_completion_length": 41.09375 }, { "clip_ratio": 0.0, "completion_length": 132.453125, "epoch": 2.4738617200674535, "grad_norm": 6.4442907881178835, "kl": 0.513671875, "learning_rate": 5.05902192242833e-07, "loss": 0.0005, "reward": 3.7874099016189575, "reward_std": 0.04841741733253002, "rewards/final_reward": 1.7951469981212824, "rewards/mask_iou_reward": 0.8975734990606412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7874098420143127, "rewards/thk_ans_format_reward": 1.0, "step": 1465, "think_completion_length": 40.6875 }, { "clip_ratio": 0.0, "completion_length": 109.3125, "epoch": 2.475548060708263, "grad_norm": 7.781073503408825, "kl": 0.564453125, "learning_rate": 5.055649241146711e-07, "loss": 0.0006, "reward": 3.5474064350128174, "reward_std": 0.23158020619302988, "rewards/final_reward": 1.626896546940197, "rewards/mask_iou_reward": 0.8134482734700985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5474063754081726, "rewards/thk_ans_format_reward": 1.0, "step": 1466, "think_completion_length": 40.78125 }, { "clip_ratio": 0.0, "completion_length": 106.53125, "epoch": 2.4772344013490724, "grad_norm": 8.120456104997903, "kl": 0.607421875, "learning_rate": 5.052276559865093e-07, "loss": 0.0006, "reward": 3.499011993408203, "reward_std": 0.10200574016198516, "rewards/final_reward": 1.596909988674616, "rewards/mask_iou_reward": 0.798454994337308, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4990119338035583, "rewards/thk_ans_format_reward": 1.0, "step": 1467, "think_completion_length": 35.875 }, { "clip_ratio": 0.0, "completion_length": 109.609375, "epoch": 2.478920741989882, "grad_norm": 10.586731590590368, "kl": 0.58984375, "learning_rate": 5.048903878583474e-07, "loss": 0.0006, "reward": 3.4600088596343994, "reward_std": 0.19180525839328766, "rewards/final_reward": 1.322128830048752, "rewards/mask_iou_reward": 0.661064415024376, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4600088596343994, "rewards/thk_ans_format_reward": 1.0, "step": 1468, "think_completion_length": 39.28125 }, { "clip_ratio": 0.0, "completion_length": 110.46875, "epoch": 2.4806070826306916, "grad_norm": 7.880859330095419, "kl": 0.5625, "learning_rate": 5.045531197301855e-07, "loss": 0.0006, "reward": 3.185870885848999, "reward_std": 0.20582804456353188, "rewards/final_reward": 1.714308307994534, "rewards/mask_iou_reward": 0.857154153997267, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1858709752559662, "rewards/thk_ans_format_reward": 1.0, "step": 1469, "think_completion_length": 41.46875 }, { "clip_ratio": 0.0, "completion_length": 118.90625, "epoch": 2.482293423271501, "grad_norm": 11.452435006601872, "kl": 0.5546875, "learning_rate": 5.042158516020237e-07, "loss": 0.0006, "reward": 3.056984066963196, "reward_std": 0.2701308634132147, "rewards/final_reward": 1.4174552743281663, "rewards/mask_iou_reward": 0.7087276371640832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0569840967655182, "rewards/thk_ans_format_reward": 1.0, "step": 1470, "think_completion_length": 35.46875 }, { "clip_ratio": 0.0, "completion_length": 126.765625, "epoch": 2.4839797639123105, "grad_norm": 17.45227197751396, "kl": 0.603515625, "learning_rate": 5.038785834738617e-07, "loss": 0.0006, "reward": 3.5259718894958496, "reward_std": 0.13498846907168627, "rewards/final_reward": 1.3153849615984572, "rewards/mask_iou_reward": 0.6576924807992286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5259720087051392, "rewards/thk_ans_format_reward": 1.0, "step": 1471, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 110.8125, "epoch": 2.4856661045531196, "grad_norm": 5.423812455061472, "kl": 0.71484375, "learning_rate": 5.035413153456997e-07, "loss": 0.0007, "reward": 2.9699904918670654, "reward_std": 0.07269694283604622, "rewards/final_reward": 0.27860076343864043, "rewards/mask_iou_reward": 0.13930038171932022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9699905514717102, "rewards/thk_ans_format_reward": 1.0, "step": 1472, "think_completion_length": 41.46875 }, { "clip_ratio": 0.0, "completion_length": 107.578125, "epoch": 2.4873524451939293, "grad_norm": 9.20306864688425, "kl": 0.533203125, "learning_rate": 5.032040472175379e-07, "loss": 0.0005, "reward": 3.86440372467041, "reward_std": 0.0190952280536294, "rewards/final_reward": 1.8322273764191868, "rewards/mask_iou_reward": 0.9161136882095934, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8644036650657654, "rewards/thk_ans_format_reward": 1.0, "step": 1473, "think_completion_length": 36.0625 }, { "clip_ratio": 0.0, "completion_length": 115.34375, "epoch": 2.4890387858347385, "grad_norm": 15.323120871459215, "kl": 0.529296875, "learning_rate": 5.02866779089376e-07, "loss": 0.0005, "reward": 2.5336424112319946, "reward_std": 0.3594963401556015, "rewards/final_reward": 0.48737288693574415, "rewards/mask_iou_reward": 0.24368644346787208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5336422324180603, "rewards/thk_ans_format_reward": 1.0, "step": 1474, "think_completion_length": 44.90625 }, { "clip_ratio": 0.0, "completion_length": 114.390625, "epoch": 2.490725126475548, "grad_norm": 6.752980501040975, "kl": 0.52734375, "learning_rate": 5.025295109612141e-07, "loss": 0.0006, "reward": 3.02541720867157, "reward_std": 0.1034752493724227, "rewards/final_reward": 1.149960956271575, "rewards/mask_iou_reward": 0.5749804781357875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0254172086715698, "rewards/thk_ans_format_reward": 1.0, "step": 1475, "think_completion_length": 41.8125 }, { "clip_ratio": 0.0, "completion_length": 166.390625, "epoch": 2.4924114671163577, "grad_norm": 222.0518474690732, "kl": 0.4921875, "learning_rate": 5.021922428330523e-07, "loss": 0.0005, "reward": 3.418384552001953, "reward_std": 0.10731749702244997, "rewards/final_reward": 1.6023031364795055, "rewards/mask_iou_reward": 0.8011515682397528, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4183844923973083, "rewards/thk_ans_format_reward": 1.0, "step": 1476, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 111.265625, "epoch": 2.494097807757167, "grad_norm": 7.146648683338561, "kl": 0.609375, "learning_rate": 5.018549747048904e-07, "loss": 0.0006, "reward": 3.5869948863983154, "reward_std": 0.0974464938044548, "rewards/final_reward": 1.3416405430706282, "rewards/mask_iou_reward": 0.6708202715353141, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5869948267936707, "rewards/thk_ans_format_reward": 1.0, "step": 1477, "think_completion_length": 41.03125 }, { "clip_ratio": 0.0, "completion_length": 125.5, "epoch": 2.4957841483979766, "grad_norm": 5.366598932763867, "kl": 0.4912109375, "learning_rate": 5.015177065767285e-07, "loss": 0.0005, "reward": 3.449162483215332, "reward_std": 0.04182947881054133, "rewards/final_reward": 1.195586466228961, "rewards/mask_iou_reward": 0.5977932331144805, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4491624236106873, "rewards/thk_ans_format_reward": 1.0, "step": 1478, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 101.796875, "epoch": 2.4974704890387858, "grad_norm": 5.736300280187392, "kl": 0.62109375, "learning_rate": 5.011804384485666e-07, "loss": 0.0006, "reward": 3.5285149812698364, "reward_std": 0.28720738738775253, "rewards/final_reward": 1.6930366481005783, "rewards/mask_iou_reward": 0.8465183240502892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5285149812698364, "rewards/thk_ans_format_reward": 1.0, "step": 1479, "think_completion_length": 34.8125 }, { "clip_ratio": 0.0, "completion_length": 119.5625, "epoch": 2.4991568296795954, "grad_norm": 8.24736762704496, "kl": 0.5224609375, "learning_rate": 5.008431703204047e-07, "loss": 0.0005, "reward": 3.488741397857666, "reward_std": 0.25808994472026825, "rewards/final_reward": 1.59323068836111, "rewards/mask_iou_reward": 0.796615344180555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4887413382530212, "rewards/thk_ans_format_reward": 1.0, "step": 1480, "think_completion_length": 35.59375 }, { "clip_ratio": 0.0, "completion_length": 108.6875, "epoch": 2.5008431703204046, "grad_norm": 11.027305372617139, "kl": 0.50390625, "learning_rate": 5.005059021922427e-07, "loss": 0.0005, "reward": 3.311052680015564, "reward_std": 0.36544879525899887, "rewards/final_reward": 1.7231988014742277, "rewards/mask_iou_reward": 0.8615994007371138, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3110525608062744, "rewards/thk_ans_format_reward": 1.0, "step": 1481, "think_completion_length": 42.15625 }, { "clip_ratio": 0.0, "completion_length": 109.65625, "epoch": 2.5025295109612142, "grad_norm": 11.319671035463822, "kl": 0.626953125, "learning_rate": 5.001686340640809e-07, "loss": 0.0006, "reward": 3.3623111248016357, "reward_std": 0.09125454165041447, "rewards/final_reward": 1.0992310112762569, "rewards/mask_iou_reward": 0.5496155056381284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3623109459877014, "rewards/thk_ans_format_reward": 1.0, "step": 1482, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 172.5, "epoch": 2.504215851602024, "grad_norm": 6.014732577915013, "kl": 0.4951171875, "learning_rate": 4.99831365935919e-07, "loss": 0.0005, "reward": 3.3078333139419556, "reward_std": 0.4521195776760578, "rewards/final_reward": 1.6163931838358212, "rewards/mask_iou_reward": 0.8081965919179106, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.3703332543373108, "rewards/thk_ans_format_reward": 0.96875, "step": 1483, "think_completion_length": 37.3125 }, { "clip_ratio": 0.0, "completion_length": 110.625, "epoch": 2.505902192242833, "grad_norm": 8.711154969838693, "kl": 0.57421875, "learning_rate": 4.994940978077571e-07, "loss": 0.0006, "reward": 3.4808534383773804, "reward_std": 0.10041437298059464, "rewards/final_reward": 1.5617287270725693, "rewards/mask_iou_reward": 0.7808643635362846, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4808533787727356, "rewards/thk_ans_format_reward": 1.0, "step": 1484, "think_completion_length": 33.21875 }, { "clip_ratio": 0.0, "completion_length": 125.921875, "epoch": 2.5075885328836423, "grad_norm": 6.4521826255397885, "kl": 0.6279296875, "learning_rate": 4.991568296795953e-07, "loss": 0.0006, "reward": 3.675198793411255, "reward_std": 0.23655812442302704, "rewards/final_reward": 1.8064039526925049, "rewards/mask_iou_reward": 0.9032019763462524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6751986742019653, "rewards/thk_ans_format_reward": 1.0, "step": 1485, "think_completion_length": 41.125 }, { "clip_ratio": 0.0, "completion_length": 105.109375, "epoch": 2.509274873524452, "grad_norm": 9.535230698250063, "kl": 0.544921875, "learning_rate": 4.988195615514334e-07, "loss": 0.0005, "reward": 3.2731704711914062, "reward_std": 0.18547899648547173, "rewards/final_reward": 1.2086966743957175, "rewards/mask_iou_reward": 0.6043483371978587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2731704115867615, "rewards/thk_ans_format_reward": 1.0, "step": 1486, "think_completion_length": 35.03125 }, { "clip_ratio": 0.0, "completion_length": 109.640625, "epoch": 2.5109612141652615, "grad_norm": 12.694000418125398, "kl": 0.5703125, "learning_rate": 4.984822934232715e-07, "loss": 0.0006, "reward": 2.720350742340088, "reward_std": 0.07947659306228161, "rewards/final_reward": 1.0917999045996127, "rewards/mask_iou_reward": 0.5458999522998064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7203506827354431, "rewards/thk_ans_format_reward": 1.0, "step": 1487, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 104.96875, "epoch": 2.5126475548060707, "grad_norm": 10.410175507358746, "kl": 0.58203125, "learning_rate": 4.981450252951096e-07, "loss": 0.0006, "reward": 3.588701009750366, "reward_std": 0.03587500285357237, "rewards/final_reward": 1.8622233827402295, "rewards/mask_iou_reward": 0.9311116913701147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5887010097503662, "rewards/thk_ans_format_reward": 1.0, "step": 1488, "think_completion_length": 36.0 }, { "clip_ratio": 0.0, "completion_length": 106.625, "epoch": 2.5143338954468804, "grad_norm": 7.451733615106546, "kl": 0.56640625, "learning_rate": 4.978077571669478e-07, "loss": 0.0006, "reward": 3.5282169580459595, "reward_std": 0.10562526807188988, "rewards/final_reward": 1.3080303513620586, "rewards/mask_iou_reward": 0.6540151756810293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.528217077255249, "rewards/thk_ans_format_reward": 1.0, "step": 1489, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 219.34375, "epoch": 2.51602023608769, "grad_norm": 19.399719466274046, "kl": 0.498046875, "learning_rate": 4.974704890387858e-07, "loss": 0.0005, "reward": 3.088352680206299, "reward_std": 0.4080119878053665, "rewards/final_reward": 1.228325888967735, "rewards/mask_iou_reward": 0.6141629444838675, "rewards/sam_format_reward": 0.921875, "rewards/sam_reward_func_ultra": 1.2446027398109436, "rewards/thk_ans_format_reward": 0.921875, "step": 1490, "think_completion_length": 36.1875 }, { "clip_ratio": 0.0, "completion_length": 105.96875, "epoch": 2.517706576728499, "grad_norm": 7.137551160868289, "kl": 2.7939453125, "learning_rate": 4.971332209106239e-07, "loss": 0.0028, "reward": 3.375158429145813, "reward_std": 0.16005902830511332, "rewards/final_reward": 1.8587552642109353, "rewards/mask_iou_reward": 0.9293776321054676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.375158429145813, "rewards/thk_ans_format_reward": 1.0, "step": 1491, "think_completion_length": 35.0625 }, { "clip_ratio": 0.0, "completion_length": 123.5, "epoch": 2.5193929173693084, "grad_norm": 8.313529358750277, "kl": 0.46484375, "learning_rate": 4.96795952782462e-07, "loss": 0.0005, "reward": 3.03733229637146, "reward_std": 0.10004133731126785, "rewards/final_reward": 1.4227331040090978, "rewards/mask_iou_reward": 0.7113665520045489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0373322367668152, "rewards/thk_ans_format_reward": 1.0, "step": 1492, "think_completion_length": 37.0 }, { "clip_ratio": 0.0, "completion_length": 116.296875, "epoch": 2.521079258010118, "grad_norm": 13.336282053939824, "kl": 0.54296875, "learning_rate": 4.964586846543001e-07, "loss": 0.0005, "reward": 3.2911131381988525, "reward_std": 0.08773962408304214, "rewards/final_reward": 0.6487437595690562, "rewards/mask_iou_reward": 0.3243718797845281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2911131978034973, "rewards/thk_ans_format_reward": 1.0, "step": 1493, "think_completion_length": 35.78125 }, { "clip_ratio": 0.0, "completion_length": 128.28125, "epoch": 2.5227655986509276, "grad_norm": 37.03975936055251, "kl": 0.5, "learning_rate": 4.961214165261383e-07, "loss": 0.0005, "reward": 3.58980131149292, "reward_std": 0.0956022769678384, "rewards/final_reward": 1.7125394411309012, "rewards/mask_iou_reward": 0.8562697205654506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.58980131149292, "rewards/thk_ans_format_reward": 1.0, "step": 1494, "think_completion_length": 40.0 }, { "clip_ratio": 0.0, "completion_length": 107.640625, "epoch": 2.524451939291737, "grad_norm": 5.045301472235186, "kl": 0.56640625, "learning_rate": 4.957841483979764e-07, "loss": 0.0006, "reward": 2.850724458694458, "reward_std": 0.1821054145693779, "rewards/final_reward": 0.9959888150127159, "rewards/mask_iou_reward": 0.49799440750635793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8507243692874908, "rewards/thk_ans_format_reward": 1.0, "step": 1495, "think_completion_length": 37.6875 }, { "clip_ratio": 0.0, "completion_length": 110.625, "epoch": 2.5261382799325465, "grad_norm": 14.382459957277339, "kl": 0.5625, "learning_rate": 4.954468802698145e-07, "loss": 0.0006, "reward": 2.9980448484420776, "reward_std": 0.07024937309324741, "rewards/final_reward": 1.1191838693528742, "rewards/mask_iou_reward": 0.5595919346764371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9980448931455612, "rewards/thk_ans_format_reward": 1.0, "step": 1496, "think_completion_length": 38.28125 }, { "clip_ratio": 0.0, "completion_length": 130.359375, "epoch": 2.5278246205733557, "grad_norm": 7.46836121492322, "kl": 0.513671875, "learning_rate": 4.951096121416526e-07, "loss": 0.0005, "reward": 3.2769733667373657, "reward_std": 0.15931928902864456, "rewards/final_reward": 1.3874132598866995, "rewards/mask_iou_reward": 0.6937066299433498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2769732475280762, "rewards/thk_ans_format_reward": 1.0, "step": 1497, "think_completion_length": 39.0 }, { "clip_ratio": 0.0, "completion_length": 150.296875, "epoch": 2.5295109612141653, "grad_norm": 11.852359868027275, "kl": 0.51953125, "learning_rate": 4.947723440134908e-07, "loss": 0.0005, "reward": 3.484477162361145, "reward_std": 0.2647605128586292, "rewards/final_reward": 1.5508365032119544, "rewards/mask_iou_reward": 0.7754182516059772, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.500102162361145, "rewards/thk_ans_format_reward": 1.0, "step": 1498, "think_completion_length": 40.5625 }, { "clip_ratio": 0.0, "completion_length": 107.546875, "epoch": 2.5311973018549745, "grad_norm": 7.236240225814442, "kl": 1.712890625, "learning_rate": 4.944350758853287e-07, "loss": 0.0017, "reward": 3.281991481781006, "reward_std": 0.09113920107483864, "rewards/final_reward": 1.6733748077233552, "rewards/mask_iou_reward": 0.8366874038616776, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2819915413856506, "rewards/thk_ans_format_reward": 1.0, "step": 1499, "think_completion_length": 35.71875 }, { "clip_ratio": 0.0, "completion_length": 111.953125, "epoch": 2.532883642495784, "grad_norm": 5.106811671021872, "kl": 0.529296875, "learning_rate": 4.940978077571669e-07, "loss": 0.0005, "reward": 3.3747832775115967, "reward_std": 0.03719430975615978, "rewards/final_reward": 1.7466277158454178, "rewards/mask_iou_reward": 0.8733138579227089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3747830986976624, "rewards/thk_ans_format_reward": 1.0, "step": 1500, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 109.859375, "epoch": 2.5345699831365938, "grad_norm": 8.000033394139844, "kl": 0.5205078125, "learning_rate": 4.93760539629005e-07, "loss": 0.0005, "reward": 3.3949146270751953, "reward_std": 0.06679772771894932, "rewards/final_reward": 1.4510022893249355, "rewards/mask_iou_reward": 0.7255011446624677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3949146270751953, "rewards/thk_ans_format_reward": 1.0, "step": 1501, "think_completion_length": 38.46875 }, { "clip_ratio": 0.0, "completion_length": 108.625, "epoch": 2.536256323777403, "grad_norm": 27.011764406123124, "kl": 0.55859375, "learning_rate": 4.934232715008432e-07, "loss": 0.0006, "reward": 3.6643035411834717, "reward_std": 0.10874908417463303, "rewards/final_reward": 1.6199177468343642, "rewards/mask_iou_reward": 0.8099588734171821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6643034219741821, "rewards/thk_ans_format_reward": 1.0, "step": 1502, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 114.015625, "epoch": 2.5379426644182126, "grad_norm": 11.198242643547353, "kl": 0.54296875, "learning_rate": 4.930860033726813e-07, "loss": 0.0005, "reward": 3.4217268228530884, "reward_std": 0.14082890190184116, "rewards/final_reward": 1.3744126428382355, "rewards/mask_iou_reward": 0.6872063214191177, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4217267632484436, "rewards/thk_ans_format_reward": 1.0, "step": 1503, "think_completion_length": 41.75 }, { "clip_ratio": 0.0, "completion_length": 152.921875, "epoch": 2.539629005059022, "grad_norm": 6.024812195755251, "kl": 1.859375, "learning_rate": 4.927487352445194e-07, "loss": 0.0019, "reward": 3.6555745601654053, "reward_std": 0.021429577842354774, "rewards/final_reward": 1.4500117782519568, "rewards/mask_iou_reward": 0.7250058891259784, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6555745005607605, "rewards/thk_ans_format_reward": 1.0, "step": 1504, "think_completion_length": 40.6875 }, { "clip_ratio": 0.0, "completion_length": 124.796875, "epoch": 2.5413153456998314, "grad_norm": 4.938521081033098, "kl": 0.560546875, "learning_rate": 4.924114671163575e-07, "loss": 0.0006, "reward": 3.088452100753784, "reward_std": 0.11477963626384735, "rewards/final_reward": 1.2085818779067128, "rewards/mask_iou_reward": 0.6042909389533564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0884520709514618, "rewards/thk_ans_format_reward": 1.0, "step": 1505, "think_completion_length": 37.75 }, { "clip_ratio": 0.0, "completion_length": 125.5625, "epoch": 2.5430016863406406, "grad_norm": 23.17355611185932, "kl": 0.5234375, "learning_rate": 4.920741989881956e-07, "loss": 0.0005, "reward": 3.434250831604004, "reward_std": 0.0526156984269619, "rewards/final_reward": 1.5713436340782292, "rewards/mask_iou_reward": 0.7856718170391146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4342508912086487, "rewards/thk_ans_format_reward": 1.0, "step": 1506, "think_completion_length": 42.40625 }, { "clip_ratio": 0.0, "completion_length": 146.625, "epoch": 2.5446880269814502, "grad_norm": 4.824502597592859, "kl": 0.431640625, "learning_rate": 4.917369308600338e-07, "loss": 0.0004, "reward": 3.3531363010406494, "reward_std": 0.09069814160466194, "rewards/final_reward": 1.1147132374340087, "rewards/mask_iou_reward": 0.5573566187170044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3531363010406494, "rewards/thk_ans_format_reward": 1.0, "step": 1507, "think_completion_length": 47.15625 }, { "clip_ratio": 0.0, "completion_length": 113.515625, "epoch": 2.54637436762226, "grad_norm": 9.57819569491857, "kl": 0.546875, "learning_rate": 4.913996627318718e-07, "loss": 0.0006, "reward": 3.4678783416748047, "reward_std": 0.16838806122541428, "rewards/final_reward": 1.2330647677428894, "rewards/mask_iou_reward": 0.6165323838714447, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4678783416748047, "rewards/thk_ans_format_reward": 1.0, "step": 1508, "think_completion_length": 37.78125 }, { "clip_ratio": 0.0, "completion_length": 112.796875, "epoch": 2.548060708263069, "grad_norm": 5.676967471760433, "kl": 0.533203125, "learning_rate": 4.910623946037099e-07, "loss": 0.0006, "reward": 3.423642158508301, "reward_std": 0.09652687440393493, "rewards/final_reward": 1.5992528204709278, "rewards/mask_iou_reward": 0.7996264102354639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4236420392990112, "rewards/thk_ans_format_reward": 1.0, "step": 1509, "think_completion_length": 40.3125 }, { "clip_ratio": 0.0, "completion_length": 122.890625, "epoch": 2.5497470489038787, "grad_norm": 7.789483168092156, "kl": 0.513671875, "learning_rate": 4.90725126475548e-07, "loss": 0.0005, "reward": 2.3797736167907715, "reward_std": 0.11256012320518494, "rewards/final_reward": 0.4328186320145556, "rewards/mask_iou_reward": 0.2164093160072778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.3797735422849655, "rewards/thk_ans_format_reward": 1.0, "step": 1510, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 108.03125, "epoch": 2.551433389544688, "grad_norm": 9.713157031940664, "kl": 0.544921875, "learning_rate": 4.903878583473862e-07, "loss": 0.0005, "reward": 3.414551854133606, "reward_std": 0.11701996996998787, "rewards/final_reward": 1.3789813631599346, "rewards/mask_iou_reward": 0.6894906815799673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4145516157150269, "rewards/thk_ans_format_reward": 1.0, "step": 1511, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 136.28125, "epoch": 2.5531197301854975, "grad_norm": 5.514874996478179, "kl": 0.58203125, "learning_rate": 4.900505902192242e-07, "loss": 0.0006, "reward": 3.4243216514587402, "reward_std": 0.0784766897559166, "rewards/final_reward": 1.890977303160391, "rewards/mask_iou_reward": 0.9454886515801955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4243216514587402, "rewards/thk_ans_format_reward": 1.0, "step": 1512, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 142.234375, "epoch": 2.5548060708263067, "grad_norm": 8.96725315369348, "kl": 0.513671875, "learning_rate": 4.897133220910624e-07, "loss": 0.0005, "reward": 2.880871295928955, "reward_std": 0.15390251949429512, "rewards/final_reward": 1.1972393384262139, "rewards/mask_iou_reward": 0.5986196692131069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8808711767196655, "rewards/thk_ans_format_reward": 1.0, "step": 1513, "think_completion_length": 42.03125 }, { "clip_ratio": 0.0, "completion_length": 116.71875, "epoch": 2.5564924114671164, "grad_norm": 27.2516380009568, "kl": 0.517578125, "learning_rate": 4.893760539629005e-07, "loss": 0.0005, "reward": 3.5386931896209717, "reward_std": 0.22735736519098282, "rewards/final_reward": 1.276440365439687, "rewards/mask_iou_reward": 0.6382201827198435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.538693130016327, "rewards/thk_ans_format_reward": 1.0, "step": 1514, "think_completion_length": 48.4375 }, { "clip_ratio": 0.0, "completion_length": 147.671875, "epoch": 2.558178752107926, "grad_norm": 5.983763326608007, "kl": 0.529296875, "learning_rate": 4.890387858347387e-07, "loss": 0.0005, "reward": 3.8019983768463135, "reward_std": 0.02921892609447241, "rewards/final_reward": 1.868028471346339, "rewards/mask_iou_reward": 0.9340142356731695, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8019983768463135, "rewards/thk_ans_format_reward": 1.0, "step": 1515, "think_completion_length": 38.53125 }, { "clip_ratio": 0.0, "completion_length": 185.34375, "epoch": 2.559865092748735, "grad_norm": 4.5608997779574585, "kl": 0.48046875, "learning_rate": 4.887015177065766e-07, "loss": 0.0005, "reward": 2.8760156631469727, "reward_std": 0.414703406393528, "rewards/final_reward": 0.7626393288297652, "rewards/mask_iou_reward": 0.3813196644148826, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.0010156631469727, "rewards/thk_ans_format_reward": 0.9375, "step": 1516, "think_completion_length": 47.625 }, { "clip_ratio": 0.0, "completion_length": 112.015625, "epoch": 2.561551433389545, "grad_norm": 9.563988407014383, "kl": 0.494140625, "learning_rate": 4.883642495784148e-07, "loss": 0.0005, "reward": 3.6301556825637817, "reward_std": 0.5168076306581497, "rewards/final_reward": 1.6550659131921477, "rewards/mask_iou_reward": 0.8275329565960738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.630155622959137, "rewards/thk_ans_format_reward": 1.0, "step": 1517, "think_completion_length": 49.96875 }, { "clip_ratio": 0.0, "completion_length": 196.515625, "epoch": 2.563237774030354, "grad_norm": 6.757332895940087, "kl": 0.4267578125, "learning_rate": 4.880269814502529e-07, "loss": 0.0004, "reward": 3.1780283451080322, "reward_std": 0.18703092634677887, "rewards/final_reward": 1.1283806928614566, "rewards/mask_iou_reward": 0.5641903464307283, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1780283451080322, "rewards/thk_ans_format_reward": 1.0, "step": 1518, "think_completion_length": 41.34375 }, { "clip_ratio": 0.0, "completion_length": 122.140625, "epoch": 2.5649241146711637, "grad_norm": 6.581563060182763, "kl": 0.841796875, "learning_rate": 4.87689713322091e-07, "loss": 0.0008, "reward": 3.018182158470154, "reward_std": 0.04332828428596258, "rewards/final_reward": 1.3776301205739845, "rewards/mask_iou_reward": 0.6888150602869922, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.018182247877121, "rewards/thk_ans_format_reward": 1.0, "step": 1519, "think_completion_length": 41.84375 }, { "clip_ratio": 0.0, "completion_length": 147.21875, "epoch": 2.566610455311973, "grad_norm": 16.043120163718434, "kl": 0.482421875, "learning_rate": 4.873524451939291e-07, "loss": 0.0005, "reward": 3.086033821105957, "reward_std": 0.15922314673662186, "rewards/final_reward": 0.9320970069593832, "rewards/mask_iou_reward": 0.4660485034796916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.086033821105957, "rewards/thk_ans_format_reward": 1.0, "step": 1520, "think_completion_length": 45.34375 }, { "clip_ratio": 0.0, "completion_length": 125.71875, "epoch": 2.5682967959527825, "grad_norm": 12.811754975192569, "kl": 0.568359375, "learning_rate": 4.870151770657673e-07, "loss": 0.0006, "reward": 3.313421607017517, "reward_std": 0.12244333326816559, "rewards/final_reward": 1.017356395968409, "rewards/mask_iou_reward": 0.5086781979842045, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3134216666221619, "rewards/thk_ans_format_reward": 1.0, "step": 1521, "think_completion_length": 43.125 }, { "clip_ratio": 0.0, "completion_length": 116.015625, "epoch": 2.569983136593592, "grad_norm": 6.3633128568114, "kl": 0.580078125, "learning_rate": 4.866779089376054e-07, "loss": 0.0006, "reward": 3.3072917461395264, "reward_std": 0.11184610333293676, "rewards/final_reward": 1.2002857379851997, "rewards/mask_iou_reward": 0.6001428689925998, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.307291865348816, "rewards/thk_ans_format_reward": 1.0, "step": 1522, "think_completion_length": 51.78125 }, { "clip_ratio": 0.0, "completion_length": 132.359375, "epoch": 2.5716694772344013, "grad_norm": 20.475153625295707, "kl": 0.458984375, "learning_rate": 4.863406408094435e-07, "loss": 0.0004, "reward": 3.3745312690734863, "reward_std": 0.2294555902481079, "rewards/final_reward": 1.0695021371042701, "rewards/mask_iou_reward": 0.5347510685521351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.374531239271164, "rewards/thk_ans_format_reward": 1.0, "step": 1523, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 177.390625, "epoch": 2.573355817875211, "grad_norm": 6.926455621132243, "kl": 0.4404296875, "learning_rate": 4.860033726812816e-07, "loss": 0.0004, "reward": 3.157252073287964, "reward_std": 0.29762740433216095, "rewards/final_reward": 0.802155203549531, "rewards/mask_iou_reward": 0.4010776017747655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1572520732879639, "rewards/thk_ans_format_reward": 1.0, "step": 1524, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 145.765625, "epoch": 2.57504215851602, "grad_norm": 5.606031237944716, "kl": 0.47265625, "learning_rate": 4.856661045531196e-07, "loss": 0.0005, "reward": 3.2091275453567505, "reward_std": 0.10631818510591984, "rewards/final_reward": 1.401239787273532, "rewards/mask_iou_reward": 0.700619893636766, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2091275751590729, "rewards/thk_ans_format_reward": 1.0, "step": 1525, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 181.0625, "epoch": 2.5767284991568298, "grad_norm": 18.99856368894485, "kl": 0.4482421875, "learning_rate": 4.853288364249578e-07, "loss": 0.0004, "reward": 3.006584405899048, "reward_std": 0.3390379399061203, "rewards/final_reward": 0.8887834962915174, "rewards/mask_iou_reward": 0.4443917481457587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0065844655036926, "rewards/thk_ans_format_reward": 1.0, "step": 1526, "think_completion_length": 46.71875 }, { "clip_ratio": 0.0, "completion_length": 113.46875, "epoch": 2.578414839797639, "grad_norm": 10.312057039994464, "kl": 0.5625, "learning_rate": 4.849915682967959e-07, "loss": 0.0006, "reward": 3.5264012813568115, "reward_std": 0.06957072392106056, "rewards/final_reward": 1.8601035590132144, "rewards/mask_iou_reward": 0.9300517795066072, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5264012813568115, "rewards/thk_ans_format_reward": 1.0, "step": 1527, "think_completion_length": 41.15625 }, { "clip_ratio": 0.0, "completion_length": 137.3125, "epoch": 2.5801011804384486, "grad_norm": 6.768274528540181, "kl": 0.576171875, "learning_rate": 4.846543001686341e-07, "loss": 0.0006, "reward": 3.2123255729675293, "reward_std": 0.16551712527871132, "rewards/final_reward": 1.4633281006480252, "rewards/mask_iou_reward": 0.7316640503240126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2123255133628845, "rewards/thk_ans_format_reward": 1.0, "step": 1528, "think_completion_length": 47.71875 }, { "clip_ratio": 0.0, "completion_length": 97.59375, "epoch": 2.5817875210792582, "grad_norm": 4.944611934648343, "kl": 0.583984375, "learning_rate": 4.843170320404721e-07, "loss": 0.0006, "reward": 3.326672673225403, "reward_std": 0.0943008842295967, "rewards/final_reward": 0.8625031017373519, "rewards/mask_iou_reward": 0.43125155086867595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3266727328300476, "rewards/thk_ans_format_reward": 1.0, "step": 1529, "think_completion_length": 45.84375 }, { "clip_ratio": 0.0, "completion_length": 169.5, "epoch": 2.5834738617200674, "grad_norm": 7.237087579186458, "kl": 0.484375, "learning_rate": 4.839797639123103e-07, "loss": 0.0005, "reward": 3.3302892446517944, "reward_std": 0.2636425644159317, "rewards/final_reward": 1.2618188810091784, "rewards/mask_iou_reward": 0.6309094405045892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3302891850471497, "rewards/thk_ans_format_reward": 1.0, "step": 1530, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 177.765625, "epoch": 2.5851602023608766, "grad_norm": 12.82926820626442, "kl": 0.4873046875, "learning_rate": 4.836424957841484e-07, "loss": 0.0005, "reward": 3.2265427112579346, "reward_std": 0.19642452150583267, "rewards/final_reward": 1.2173636238875956, "rewards/mask_iou_reward": 0.6086818119437978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2265428006649017, "rewards/thk_ans_format_reward": 1.0, "step": 1531, "think_completion_length": 47.0625 }, { "clip_ratio": 0.0, "completion_length": 120.015625, "epoch": 2.5868465430016863, "grad_norm": 10.453234101933612, "kl": 0.51953125, "learning_rate": 4.833052276559865e-07, "loss": 0.0005, "reward": 2.6653435230255127, "reward_std": 0.0757587868720293, "rewards/final_reward": 1.1625929683638556, "rewards/mask_iou_reward": 0.5812964841819278, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6653436124324799, "rewards/thk_ans_format_reward": 1.0, "step": 1532, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 156.328125, "epoch": 2.588532883642496, "grad_norm": 6.342628822025022, "kl": 0.556640625, "learning_rate": 4.829679595278246e-07, "loss": 0.0006, "reward": 3.794961452484131, "reward_std": 0.13813296146690845, "rewards/final_reward": 1.9120891559917732, "rewards/mask_iou_reward": 0.9560445779958866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.794961392879486, "rewards/thk_ans_format_reward": 1.0, "step": 1533, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 111.578125, "epoch": 2.590219224283305, "grad_norm": 18.23118607933623, "kl": 0.515625, "learning_rate": 4.826306913996627e-07, "loss": 0.0005, "reward": 2.9826323986053467, "reward_std": 0.03328784089535475, "rewards/final_reward": 0.8880378820892472, "rewards/mask_iou_reward": 0.4440189410446236, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9826323986053467, "rewards/thk_ans_format_reward": 1.0, "step": 1534, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 161.09375, "epoch": 2.5919055649241147, "grad_norm": 8.61276484430537, "kl": 0.5537109375, "learning_rate": 4.822934232715008e-07, "loss": 0.0006, "reward": 3.336413264274597, "reward_std": 0.17770569026470184, "rewards/final_reward": 1.638049519197422, "rewards/mask_iou_reward": 0.819024759598711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3364134430885315, "rewards/thk_ans_format_reward": 1.0, "step": 1535, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 112.140625, "epoch": 2.5935919055649244, "grad_norm": 5.034058915635746, "kl": 0.533203125, "learning_rate": 4.819561551433389e-07, "loss": 0.0005, "reward": 3.8958654403686523, "reward_std": 0.021186801604926586, "rewards/final_reward": 1.8879820680199333, "rewards/mask_iou_reward": 0.9439910340099666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.895865261554718, "rewards/thk_ans_format_reward": 1.0, "step": 1536, "think_completion_length": 43.1875 }, { "clip_ratio": 0.0, "completion_length": 120.234375, "epoch": 2.5952782462057336, "grad_norm": 12.630529991131136, "kl": 0.55859375, "learning_rate": 4.81618887015177e-07, "loss": 0.0006, "reward": 3.3030004501342773, "reward_std": 0.10049432516098022, "rewards/final_reward": 1.3787357369782953, "rewards/mask_iou_reward": 0.6893678684891477, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303000271320343, "rewards/thk_ans_format_reward": 1.0, "step": 1537, "think_completion_length": 45.90625 }, { "clip_ratio": 0.0, "completion_length": 119.359375, "epoch": 2.5969645868465427, "grad_norm": 13.931755040748557, "kl": 0.49609375, "learning_rate": 4.812816188870151e-07, "loss": 0.0005, "reward": 3.333008289337158, "reward_std": 0.15168678015470505, "rewards/final_reward": 1.42218525400646, "rewards/mask_iou_reward": 0.71109262700323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3330082297325134, "rewards/thk_ans_format_reward": 1.0, "step": 1538, "think_completion_length": 45.875 }, { "clip_ratio": 0.0, "completion_length": 134.0625, "epoch": 2.5986509274873524, "grad_norm": 5.2649389912705375, "kl": 0.5078125, "learning_rate": 4.809443507588533e-07, "loss": 0.0005, "reward": 2.6697299480438232, "reward_std": 0.0926759373396635, "rewards/final_reward": 0.5165016829960553, "rewards/mask_iou_reward": 0.25825084149802763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6697298586368561, "rewards/thk_ans_format_reward": 1.0, "step": 1539, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 147.328125, "epoch": 2.600337268128162, "grad_norm": 21.037030317936612, "kl": 0.55859375, "learning_rate": 4.806070826306914e-07, "loss": 0.0006, "reward": 3.2079389095306396, "reward_std": 0.12378528714179993, "rewards/final_reward": 1.4359067824635598, "rewards/mask_iou_reward": 0.7179533912317799, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2079389095306396, "rewards/thk_ans_format_reward": 1.0, "step": 1540, "think_completion_length": 43.0 }, { "clip_ratio": 0.0, "completion_length": 157.59375, "epoch": 2.602023608768971, "grad_norm": 10.794107348698743, "kl": 0.470703125, "learning_rate": 4.802698145025295e-07, "loss": 0.0005, "reward": 3.417840003967285, "reward_std": 0.06528156064450741, "rewards/final_reward": 1.9286261272195278, "rewards/mask_iou_reward": 0.9643130636097639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4178398847579956, "rewards/thk_ans_format_reward": 1.0, "step": 1541, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 132.15625, "epoch": 2.603709949409781, "grad_norm": 7.865836908975941, "kl": 0.564453125, "learning_rate": 4.799325463743676e-07, "loss": 0.0007, "reward": 3.137349486351013, "reward_std": 0.04046285804361105, "rewards/final_reward": 0.8131963840540475, "rewards/mask_iou_reward": 0.40659819202702374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1373494267463684, "rewards/thk_ans_format_reward": 1.0, "step": 1542, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 113.640625, "epoch": 2.6053962900505905, "grad_norm": 5.886858445716339, "kl": 0.61328125, "learning_rate": 4.795952782462057e-07, "loss": 0.0006, "reward": 3.5436571836471558, "reward_std": 0.3002474457025528, "rewards/final_reward": 1.548718318359077, "rewards/mask_iou_reward": 0.7743591591795385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5436573028564453, "rewards/thk_ans_format_reward": 1.0, "step": 1543, "think_completion_length": 44.15625 }, { "clip_ratio": 0.0, "completion_length": 125.84375, "epoch": 2.6070826306913997, "grad_norm": 10.253609083824529, "kl": 0.53125, "learning_rate": 4.792580101180438e-07, "loss": 0.0005, "reward": 3.101767063140869, "reward_std": 0.3000176250934601, "rewards/final_reward": 1.0033951506957848, "rewards/mask_iou_reward": 0.5016975753478924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1017670631408691, "rewards/thk_ans_format_reward": 1.0, "step": 1544, "think_completion_length": 46.875 }, { "clip_ratio": 0.0, "completion_length": 112.75, "epoch": 2.608768971332209, "grad_norm": 9.753023875173408, "kl": 0.55859375, "learning_rate": 4.789207419898819e-07, "loss": 0.0006, "reward": 3.387078046798706, "reward_std": 0.01357703935354948, "rewards/final_reward": 1.0556593821705356, "rewards/mask_iou_reward": 0.5278296910852678, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3870778679847717, "rewards/thk_ans_format_reward": 1.0, "step": 1545, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 108.140625, "epoch": 2.6104553119730185, "grad_norm": 75.76666417873618, "kl": 0.478515625, "learning_rate": 4.7858347386172e-07, "loss": 0.0004, "reward": 3.6486486196517944, "reward_std": 0.2635802363511175, "rewards/final_reward": 1.7562827520153421, "rewards/mask_iou_reward": 0.8781413760076711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6486486196517944, "rewards/thk_ans_format_reward": 1.0, "step": 1546, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 148.53125, "epoch": 2.612141652613828, "grad_norm": 5.692902016421105, "kl": 0.474609375, "learning_rate": 4.782462057335582e-07, "loss": 0.0005, "reward": 3.7076621055603027, "reward_std": 0.046445537358522415, "rewards/final_reward": 1.5944065581111615, "rewards/mask_iou_reward": 0.7972032790555807, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7076621055603027, "rewards/thk_ans_format_reward": 1.0, "step": 1547, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 123.15625, "epoch": 2.6138279932546373, "grad_norm": 4.7331556470796725, "kl": 0.53515625, "learning_rate": 4.779089376053963e-07, "loss": 0.0005, "reward": 2.840194821357727, "reward_std": 0.17273178696632385, "rewards/final_reward": 0.8641808244057804, "rewards/mask_iou_reward": 0.4320904122028902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.840194795280695, "rewards/thk_ans_format_reward": 1.0, "step": 1548, "think_completion_length": 46.375 }, { "clip_ratio": 0.0, "completion_length": 115.5625, "epoch": 2.615514333895447, "grad_norm": 6.113780727882999, "kl": 0.544921875, "learning_rate": 4.775716694772344e-07, "loss": 0.0006, "reward": 3.314778447151184, "reward_std": 0.10576976649463177, "rewards/final_reward": 1.2631982577774228, "rewards/mask_iou_reward": 0.6315991288887114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3147783279418945, "rewards/thk_ans_format_reward": 1.0, "step": 1549, "think_completion_length": 49.5625 }, { "clip_ratio": 0.0, "completion_length": 115.59375, "epoch": 2.6172006745362566, "grad_norm": 14.729443816776524, "kl": 0.51171875, "learning_rate": 4.772344013490725e-07, "loss": 0.0005, "reward": 3.392350196838379, "reward_std": 0.02547481842339039, "rewards/final_reward": 1.917889459453028, "rewards/mask_iou_reward": 0.958944729726514, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3923503756523132, "rewards/thk_ans_format_reward": 1.0, "step": 1550, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 116.453125, "epoch": 2.618887015177066, "grad_norm": 23.575749523820498, "kl": 0.4873046875, "learning_rate": 4.768971332209106e-07, "loss": 0.0005, "reward": 3.108386278152466, "reward_std": 0.16571337264031172, "rewards/final_reward": 0.7428194446241151, "rewards/mask_iou_reward": 0.37140972231205754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1083862483501434, "rewards/thk_ans_format_reward": 1.0, "step": 1551, "think_completion_length": 45.5 }, { "clip_ratio": 0.0, "completion_length": 114.59375, "epoch": 2.620573355817875, "grad_norm": 25.729924296933714, "kl": 0.4970703125, "learning_rate": 4.765598650927487e-07, "loss": 0.0005, "reward": 3.8241816759109497, "reward_std": 0.2826864686794579, "rewards/final_reward": 1.802022132728133, "rewards/mask_iou_reward": 0.9010110663640665, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.8398066759109497, "rewards/thk_ans_format_reward": 1.0, "step": 1552, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 112.515625, "epoch": 2.6222596964586846, "grad_norm": 11.691282009828335, "kl": 0.4853515625, "learning_rate": 4.7622259696458683e-07, "loss": 0.0005, "reward": 3.5969111919403076, "reward_std": 0.09428023174405098, "rewards/final_reward": 1.7918991579440622, "rewards/mask_iou_reward": 0.8959495789720311, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5969111919403076, "rewards/thk_ans_format_reward": 1.0, "step": 1553, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 140.234375, "epoch": 2.6239460370994943, "grad_norm": 6.5807467921403, "kl": 0.533203125, "learning_rate": 4.7588532883642497e-07, "loss": 0.0005, "reward": 3.04490065574646, "reward_std": 0.21727034822106361, "rewards/final_reward": 1.1348485780762427, "rewards/mask_iou_reward": 0.5674242890381214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0449006259441376, "rewards/thk_ans_format_reward": 1.0, "step": 1554, "think_completion_length": 48.0625 }, { "clip_ratio": 0.0, "completion_length": 130.71875, "epoch": 2.6256323777403034, "grad_norm": 16.10521079538983, "kl": 0.623046875, "learning_rate": 4.75548060708263e-07, "loss": 0.0006, "reward": 3.658421516418457, "reward_std": 0.16007404774427414, "rewards/final_reward": 1.7699895470810052, "rewards/mask_iou_reward": 0.8849947735405026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6584214568138123, "rewards/thk_ans_format_reward": 1.0, "step": 1555, "think_completion_length": 42.625 }, { "clip_ratio": 0.0, "completion_length": 110.78125, "epoch": 2.627318718381113, "grad_norm": 42.81121603125669, "kl": 0.548828125, "learning_rate": 4.7521079258010115e-07, "loss": 0.0006, "reward": 3.4277206659317017, "reward_std": 0.03460780787281692, "rewards/final_reward": 1.1004128676198721, "rewards/mask_iou_reward": 0.5502064338099361, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4277206063270569, "rewards/thk_ans_format_reward": 1.0, "step": 1556, "think_completion_length": 41.59375 }, { "clip_ratio": 0.0, "completion_length": 111.96875, "epoch": 2.6290050590219223, "grad_norm": 11.764793983624923, "kl": 0.54296875, "learning_rate": 4.748735244519393e-07, "loss": 0.0005, "reward": 3.1046032905578613, "reward_std": 0.1467623095959425, "rewards/final_reward": 1.4726183202278702, "rewards/mask_iou_reward": 0.7363091601139351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.104603350162506, "rewards/thk_ans_format_reward": 1.0, "step": 1557, "think_completion_length": 37.59375 }, { "clip_ratio": 0.0, "completion_length": 121.03125, "epoch": 2.630691399662732, "grad_norm": 7.374056984850983, "kl": 0.521484375, "learning_rate": 4.745362563237774e-07, "loss": 0.0005, "reward": 3.222964286804199, "reward_std": 0.09663717821240425, "rewards/final_reward": 1.8764069665929193, "rewards/mask_iou_reward": 0.9382034832964596, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.222964346408844, "rewards/thk_ans_format_reward": 1.0, "step": 1558, "think_completion_length": 42.46875 }, { "clip_ratio": 0.0, "completion_length": 134.90625, "epoch": 2.632377740303541, "grad_norm": 8.212332172856842, "kl": 0.47265625, "learning_rate": 4.741989881956155e-07, "loss": 0.0005, "reward": 3.1506274938583374, "reward_std": 0.13635646551847458, "rewards/final_reward": 0.6150793550844958, "rewards/mask_iou_reward": 0.3075396775422479, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1506274938583374, "rewards/thk_ans_format_reward": 1.0, "step": 1559, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 118.71875, "epoch": 2.6340640809443507, "grad_norm": 10.737499573640928, "kl": 0.501953125, "learning_rate": 4.738617200674536e-07, "loss": 0.0005, "reward": 3.4245011806488037, "reward_std": 0.24318117648363113, "rewards/final_reward": 1.8178093915501898, "rewards/mask_iou_reward": 0.9089046957750949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.424501121044159, "rewards/thk_ans_format_reward": 1.0, "step": 1560, "think_completion_length": 50.625 }, { "clip_ratio": 0.0, "completion_length": 114.5, "epoch": 2.6357504215851604, "grad_norm": 9.811740705111555, "kl": 0.53125, "learning_rate": 4.735244519392917e-07, "loss": 0.0005, "reward": 3.083121180534363, "reward_std": 0.17737470380961895, "rewards/final_reward": 0.7628667234987518, "rewards/mask_iou_reward": 0.3814333617493759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0831211805343628, "rewards/thk_ans_format_reward": 1.0, "step": 1561, "think_completion_length": 45.0625 }, { "clip_ratio": 0.0, "completion_length": 116.046875, "epoch": 2.6374367622259696, "grad_norm": 17.93304553748424, "kl": 0.55078125, "learning_rate": 4.7318718381112983e-07, "loss": 0.0006, "reward": 3.072922706604004, "reward_std": 0.24106465280056, "rewards/final_reward": 1.264943024137027, "rewards/mask_iou_reward": 0.6324715120685135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0729226768016815, "rewards/thk_ans_format_reward": 1.0, "step": 1562, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 125.15625, "epoch": 2.639123102866779, "grad_norm": 16.148344624678465, "kl": 0.46875, "learning_rate": 4.7284991568296797e-07, "loss": 0.0005, "reward": 3.426049590110779, "reward_std": 0.09079774469137192, "rewards/final_reward": 1.5492462119283235, "rewards/mask_iou_reward": 0.7746231059641617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.426049828529358, "rewards/thk_ans_format_reward": 1.0, "step": 1563, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 166.984375, "epoch": 2.6408094435075884, "grad_norm": 10.99108255679532, "kl": 0.4794921875, "learning_rate": 4.7251264755480606e-07, "loss": 0.0005, "reward": 3.2649797201156616, "reward_std": 0.2973283752799034, "rewards/final_reward": 0.6567315942814922, "rewards/mask_iou_reward": 0.3283657971407461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2649796605110168, "rewards/thk_ans_format_reward": 1.0, "step": 1564, "think_completion_length": 46.875 }, { "clip_ratio": 0.0, "completion_length": 113.5625, "epoch": 2.642495784148398, "grad_norm": 9.148717957849211, "kl": 0.537109375, "learning_rate": 4.7217537942664415e-07, "loss": 0.0005, "reward": 3.450225353240967, "reward_std": 0.32389168441295624, "rewards/final_reward": 1.584107655751557, "rewards/mask_iou_reward": 0.7920538278757785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4502254724502563, "rewards/thk_ans_format_reward": 1.0, "step": 1565, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 125.140625, "epoch": 2.6441821247892072, "grad_norm": 10.15309035210223, "kl": 0.466796875, "learning_rate": 4.718381112984823e-07, "loss": 0.0005, "reward": 3.3438801765441895, "reward_std": 0.15512818098068237, "rewards/final_reward": 1.4235350129170727, "rewards/mask_iou_reward": 0.7117675064585364, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3438800573349, "rewards/thk_ans_format_reward": 1.0, "step": 1566, "think_completion_length": 43.6875 }, { "clip_ratio": 0.0, "completion_length": 139.640625, "epoch": 2.645868465430017, "grad_norm": 34.60246296479256, "kl": 0.4931640625, "learning_rate": 4.715008431703204e-07, "loss": 0.0005, "reward": 2.9925897121429443, "reward_std": 0.13927190750837326, "rewards/final_reward": 1.2736035743438077, "rewards/mask_iou_reward": 0.6368017871719038, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.992589682340622, "rewards/thk_ans_format_reward": 1.0, "step": 1567, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 113.375, "epoch": 2.6475548060708265, "grad_norm": 13.797768570000349, "kl": 0.59375, "learning_rate": 4.7116357504215846e-07, "loss": 0.0006, "reward": 2.680661916732788, "reward_std": 0.19025126099586487, "rewards/final_reward": 0.9316987866921573, "rewards/mask_iou_reward": 0.46584939334607867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6806618869304657, "rewards/thk_ans_format_reward": 1.0, "step": 1568, "think_completion_length": 39.4375 }, { "clip_ratio": 0.0, "completion_length": 114.890625, "epoch": 2.6492411467116357, "grad_norm": 6.704404939031757, "kl": 0.5263671875, "learning_rate": 4.708263069139966e-07, "loss": 0.0005, "reward": 3.408167600631714, "reward_std": 0.08674982748925686, "rewards/final_reward": 1.8698519267907314, "rewards/mask_iou_reward": 0.9349259633953657, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4081675112247467, "rewards/thk_ans_format_reward": 1.0, "step": 1569, "think_completion_length": 40.25 }, { "clip_ratio": 0.0, "completion_length": 110.375, "epoch": 2.6509274873524453, "grad_norm": 67.84706890476959, "kl": 0.529296875, "learning_rate": 4.7048903878583474e-07, "loss": 0.0005, "reward": 3.11447811126709, "reward_std": 0.1034508217126131, "rewards/final_reward": 1.031368616121662, "rewards/mask_iou_reward": 0.515684308060831, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1144780814647675, "rewards/thk_ans_format_reward": 1.0, "step": 1570, "think_completion_length": 42.875 }, { "clip_ratio": 0.0, "completion_length": 148.5, "epoch": 2.6526138279932545, "grad_norm": 11.334062103906817, "kl": 0.482421875, "learning_rate": 4.7015177065767283e-07, "loss": 0.0005, "reward": 3.4597952365875244, "reward_std": 0.190800953656435, "rewards/final_reward": 1.5933513720971386, "rewards/mask_iou_reward": 0.7966756860485693, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4597952961921692, "rewards/thk_ans_format_reward": 1.0, "step": 1571, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 113.40625, "epoch": 2.654300168634064, "grad_norm": 9.567327556714062, "kl": 0.52734375, "learning_rate": 4.698145025295109e-07, "loss": 0.0005, "reward": 3.682429313659668, "reward_std": 0.03618870349600911, "rewards/final_reward": 1.642017994065745, "rewards/mask_iou_reward": 0.8210089970328724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6824292540550232, "rewards/thk_ans_format_reward": 1.0, "step": 1572, "think_completion_length": 44.71875 }, { "clip_ratio": 0.0, "completion_length": 112.296875, "epoch": 2.6559865092748733, "grad_norm": 6.993710088151398, "kl": 0.546875, "learning_rate": 4.6947723440134906e-07, "loss": 0.0006, "reward": 3.35558819770813, "reward_std": 0.07169377896934748, "rewards/final_reward": 1.0683089924040468, "rewards/mask_iou_reward": 0.5341544962020234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.355588138103485, "rewards/thk_ans_format_reward": 1.0, "step": 1573, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 116.625, "epoch": 2.657672849915683, "grad_norm": 10.336596094703028, "kl": 0.5546875, "learning_rate": 4.6913996627318714e-07, "loss": 0.0006, "reward": 2.707545042037964, "reward_std": 0.15012376755475998, "rewards/final_reward": 1.2545923639349246, "rewards/mask_iou_reward": 0.6272961819674623, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7075448632240295, "rewards/thk_ans_format_reward": 1.0, "step": 1574, "think_completion_length": 46.15625 }, { "clip_ratio": 0.0, "completion_length": 118.59375, "epoch": 2.6593591905564926, "grad_norm": 7.323434814783891, "kl": 0.58984375, "learning_rate": 4.688026981450253e-07, "loss": 0.0006, "reward": 3.3268707990646362, "reward_std": 0.1611822471022606, "rewards/final_reward": 1.0528655309199106, "rewards/mask_iou_reward": 0.5264327654599553, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3268707990646362, "rewards/thk_ans_format_reward": 1.0, "step": 1575, "think_completion_length": 40.71875 }, { "clip_ratio": 0.0, "completion_length": 115.703125, "epoch": 2.661045531197302, "grad_norm": 6.051668551380872, "kl": 0.537109375, "learning_rate": 4.6846543001686337e-07, "loss": 0.0005, "reward": 3.360478639602661, "reward_std": 0.021059296559542418, "rewards/final_reward": 0.9446825785933939, "rewards/mask_iou_reward": 0.47234128929669694, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3604785799980164, "rewards/thk_ans_format_reward": 1.0, "step": 1576, "think_completion_length": 45.96875 }, { "clip_ratio": 0.0, "completion_length": 134.625, "epoch": 2.6627318718381114, "grad_norm": 9.767147695822144, "kl": 0.52734375, "learning_rate": 4.681281618887015e-07, "loss": 0.0005, "reward": 3.1824584007263184, "reward_std": 0.02653807308524847, "rewards/final_reward": 0.8726776630599029, "rewards/mask_iou_reward": 0.43633883152995145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1824583113193512, "rewards/thk_ans_format_reward": 1.0, "step": 1577, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 123.734375, "epoch": 2.6644182124789206, "grad_norm": 6.890106456107886, "kl": 0.6171875, "learning_rate": 4.677908937605396e-07, "loss": 0.0006, "reward": 3.0945212841033936, "reward_std": 0.07227480411529541, "rewards/final_reward": 0.6768194410453567, "rewards/mask_iou_reward": 0.33840972052267837, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0945212841033936, "rewards/thk_ans_format_reward": 1.0, "step": 1578, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 112.421875, "epoch": 2.6661045531197303, "grad_norm": 11.178278259713263, "kl": 0.54296875, "learning_rate": 4.6745362563237774e-07, "loss": 0.0006, "reward": 2.717486262321472, "reward_std": 0.08585362508893013, "rewards/final_reward": 1.0023938367623468, "rewards/mask_iou_reward": 0.5011969183811734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7174861431121826, "rewards/thk_ans_format_reward": 1.0, "step": 1579, "think_completion_length": 36.9375 }, { "clip_ratio": 0.0, "completion_length": 121.09375, "epoch": 2.6677908937605395, "grad_norm": 7.912862704188623, "kl": 0.4765625, "learning_rate": 4.6711635750421583e-07, "loss": 0.0005, "reward": 3.138826847076416, "reward_std": 0.030409451574087143, "rewards/final_reward": 1.1384120289812303, "rewards/mask_iou_reward": 0.5692060144906151, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1388267874717712, "rewards/thk_ans_format_reward": 1.0, "step": 1580, "think_completion_length": 37.84375 }, { "clip_ratio": 0.0, "completion_length": 117.796875, "epoch": 2.669477234401349, "grad_norm": 17.902177939828203, "kl": 0.541015625, "learning_rate": 4.667790893760539e-07, "loss": 0.0005, "reward": 2.9601058959960938, "reward_std": 0.08017378486692905, "rewards/final_reward": 0.8861616697236607, "rewards/mask_iou_reward": 0.44308083486183036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9601059556007385, "rewards/thk_ans_format_reward": 1.0, "step": 1581, "think_completion_length": 41.6875 }, { "clip_ratio": 0.0, "completion_length": 111.6875, "epoch": 2.6711635750421587, "grad_norm": 5.77744769275141, "kl": 0.53125, "learning_rate": 4.6644182124789205e-07, "loss": 0.0006, "reward": 3.140070676803589, "reward_std": 0.02819753671064973, "rewards/final_reward": 1.1835882399632247, "rewards/mask_iou_reward": 0.5917941199816124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1400706768035889, "rewards/thk_ans_format_reward": 1.0, "step": 1582, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 126.65625, "epoch": 2.672849915682968, "grad_norm": 9.166588439312482, "kl": 0.5078125, "learning_rate": 4.661045531197302e-07, "loss": 0.0005, "reward": 3.6381969451904297, "reward_std": 0.0838024877011776, "rewards/final_reward": 1.8922678160683175, "rewards/mask_iou_reward": 0.9461339080341588, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6381970047950745, "rewards/thk_ans_format_reward": 1.0, "step": 1583, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 112.140625, "epoch": 2.6745362563237776, "grad_norm": 17.661711314451015, "kl": 0.51953125, "learning_rate": 4.6576728499156823e-07, "loss": 0.0005, "reward": 3.1295218467712402, "reward_std": 0.04637359641492367, "rewards/final_reward": 1.2038188705486585, "rewards/mask_iou_reward": 0.6019094352743293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1295219659805298, "rewards/thk_ans_format_reward": 1.0, "step": 1584, "think_completion_length": 46.375 }, { "clip_ratio": 0.0, "completion_length": 142.46875, "epoch": 2.6762225969645868, "grad_norm": 6.201859830852071, "kl": 0.5673828125, "learning_rate": 4.6543001686340637e-07, "loss": 0.0006, "reward": 3.565206527709961, "reward_std": 0.22164902091026306, "rewards/final_reward": 1.2804556381149486, "rewards/mask_iou_reward": 0.6402278190574743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.565206527709961, "rewards/thk_ans_format_reward": 1.0, "step": 1585, "think_completion_length": 44.84375 }, { "clip_ratio": 0.0, "completion_length": 113.15625, "epoch": 2.6779089376053964, "grad_norm": 12.381435115250927, "kl": 0.54296875, "learning_rate": 4.650927487352445e-07, "loss": 0.0005, "reward": 3.632646679878235, "reward_std": 0.09525941498577595, "rewards/final_reward": 1.9017620673478721, "rewards/mask_iou_reward": 0.9508810336739361, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6326467394828796, "rewards/thk_ans_format_reward": 1.0, "step": 1586, "think_completion_length": 44.15625 }, { "clip_ratio": 0.0, "completion_length": 112.984375, "epoch": 2.6795952782462056, "grad_norm": 7.556322139294845, "kl": 0.69921875, "learning_rate": 4.647554806070826e-07, "loss": 0.0007, "reward": 2.895167589187622, "reward_std": 0.04426950961351395, "rewards/final_reward": 1.3905354281615103, "rewards/mask_iou_reward": 0.6952677140807552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.895167738199234, "rewards/thk_ans_format_reward": 1.0, "step": 1587, "think_completion_length": 41.875 }, { "clip_ratio": 0.0, "completion_length": 114.453125, "epoch": 2.681281618887015, "grad_norm": 5.349816511244455, "kl": 0.5625, "learning_rate": 4.6441821247892074e-07, "loss": 0.0006, "reward": 3.0216450691223145, "reward_std": 0.18535812944173813, "rewards/final_reward": 1.1056129172841695, "rewards/mask_iou_reward": 0.5528064586420848, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0216450691223145, "rewards/thk_ans_format_reward": 1.0, "step": 1588, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 114.265625, "epoch": 2.682967959527825, "grad_norm": 8.617515731302568, "kl": 0.5234375, "learning_rate": 4.640809443507588e-07, "loss": 0.0005, "reward": 2.9087109565734863, "reward_std": 0.029049073811620474, "rewards/final_reward": 0.6989310824756831, "rewards/mask_iou_reward": 0.34946554123784157, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9087110161781311, "rewards/thk_ans_format_reward": 1.0, "step": 1589, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 115.53125, "epoch": 2.684654300168634, "grad_norm": 6.251207462853013, "kl": 0.54296875, "learning_rate": 4.6374367622259697e-07, "loss": 0.0005, "reward": 3.040684938430786, "reward_std": 0.2383800894021988, "rewards/final_reward": 1.0378911364396055, "rewards/mask_iou_reward": 0.5189455682198028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0406849384307861, "rewards/thk_ans_format_reward": 1.0, "step": 1590, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 114.609375, "epoch": 2.6863406408094432, "grad_norm": 14.302390776245351, "kl": 0.51171875, "learning_rate": 4.6340640809443505e-07, "loss": 0.0005, "reward": 3.3956209421157837, "reward_std": 0.08094323147088289, "rewards/final_reward": 1.1523773228814034, "rewards/mask_iou_reward": 0.5761886614407017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3956209421157837, "rewards/thk_ans_format_reward": 1.0, "step": 1591, "think_completion_length": 38.53125 }, { "clip_ratio": 0.0, "completion_length": 123.59375, "epoch": 2.688026981450253, "grad_norm": 10.327695067737794, "kl": 0.51953125, "learning_rate": 4.630691399662732e-07, "loss": 0.0005, "reward": 3.3911901712417603, "reward_std": 0.015237356536090374, "rewards/final_reward": 1.8107498385321739, "rewards/mask_iou_reward": 0.9053749192660869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.391190081834793, "rewards/thk_ans_format_reward": 1.0, "step": 1592, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 113.640625, "epoch": 2.6897133220910625, "grad_norm": 4.515188568828064, "kl": 0.564453125, "learning_rate": 4.627318718381113e-07, "loss": 0.0005, "reward": 3.6611695289611816, "reward_std": 0.010300178895704448, "rewards/final_reward": 1.5551834657420902, "rewards/mask_iou_reward": 0.7775917328710451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6611695885658264, "rewards/thk_ans_format_reward": 1.0, "step": 1593, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 116.0625, "epoch": 2.6913996627318717, "grad_norm": 9.967779718750911, "kl": 0.5546875, "learning_rate": 4.6239460370994937e-07, "loss": 0.0005, "reward": 3.6303428411483765, "reward_std": 0.13057681638747454, "rewards/final_reward": 1.3867047517088262, "rewards/mask_iou_reward": 0.6933523758544131, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6303430199623108, "rewards/thk_ans_format_reward": 1.0, "step": 1594, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 155.328125, "epoch": 2.6930860033726813, "grad_norm": 7.092355396036124, "kl": 0.5126953125, "learning_rate": 4.620573355817875e-07, "loss": 0.0005, "reward": 3.195076107978821, "reward_std": 0.20519106090068817, "rewards/final_reward": 0.6965408540739686, "rewards/mask_iou_reward": 0.3482704270369843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1950761079788208, "rewards/thk_ans_format_reward": 1.0, "step": 1595, "think_completion_length": 39.5625 }, { "clip_ratio": 0.0, "completion_length": 114.40625, "epoch": 2.694772344013491, "grad_norm": 5.681443841176162, "kl": 0.666015625, "learning_rate": 4.6172006745362565e-07, "loss": 0.0007, "reward": 3.520709276199341, "reward_std": 0.10630087740719318, "rewards/final_reward": 1.224463961360859, "rewards/mask_iou_reward": 0.6122319806804295, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5207092761993408, "rewards/thk_ans_format_reward": 1.0, "step": 1596, "think_completion_length": 43.6875 }, { "clip_ratio": 0.0, "completion_length": 118.1875, "epoch": 2.6964586846543, "grad_norm": 26.572470268498073, "kl": 0.591796875, "learning_rate": 4.613827993254637e-07, "loss": 0.0006, "reward": 3.334014058113098, "reward_std": 0.11482397792860866, "rewards/final_reward": 1.4160978681431455, "rewards/mask_iou_reward": 0.7080489340715728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.334014117717743, "rewards/thk_ans_format_reward": 1.0, "step": 1597, "think_completion_length": 48.90625 }, { "clip_ratio": 0.0, "completion_length": 152.609375, "epoch": 2.6981450252951094, "grad_norm": 20.336568028235305, "kl": 0.556640625, "learning_rate": 4.610455311973018e-07, "loss": 0.0006, "reward": 3.5459563732147217, "reward_std": 0.05063344561494887, "rewards/final_reward": 1.6638033763740356, "rewards/mask_iou_reward": 0.8319016881870178, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.545956313610077, "rewards/thk_ans_format_reward": 1.0, "step": 1598, "think_completion_length": 46.625 }, { "clip_ratio": 0.0, "completion_length": 111.640625, "epoch": 2.699831365935919, "grad_norm": 6.445917565428539, "kl": 0.626953125, "learning_rate": 4.6070826306913996e-07, "loss": 0.0006, "reward": 3.5455654859542847, "reward_std": 0.15113668888807297, "rewards/final_reward": 1.9026466968239584, "rewards/mask_iou_reward": 0.9513233484119792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.545565664768219, "rewards/thk_ans_format_reward": 1.0, "step": 1599, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 143.125, "epoch": 2.7015177065767286, "grad_norm": 9.59769819423389, "kl": 0.494140625, "learning_rate": 4.6037099494097805e-07, "loss": 0.0005, "reward": 3.7451967000961304, "reward_std": 0.15041500329971313, "rewards/final_reward": 1.5559414480541731, "rewards/mask_iou_reward": 0.7779707240270866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.74519681930542, "rewards/thk_ans_format_reward": 1.0, "step": 1600, "think_completion_length": 48.0625 }, { "clip_ratio": 0.0, "completion_length": 124.28125, "epoch": 2.703204047217538, "grad_norm": 46.21311513255855, "kl": 0.552734375, "learning_rate": 4.6003372681281614e-07, "loss": 0.0005, "reward": 2.6555440425872803, "reward_std": 0.13194021955132484, "rewards/final_reward": 1.1682005930727084, "rewards/mask_iou_reward": 0.5841002965363542, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.655544102191925, "rewards/thk_ans_format_reward": 1.0, "step": 1601, "think_completion_length": 44.5625 }, { "clip_ratio": 0.0, "completion_length": 177.625, "epoch": 2.7048903878583475, "grad_norm": 11.008089396246493, "kl": 0.5478515625, "learning_rate": 4.596964586846543e-07, "loss": 0.0005, "reward": 3.157100558280945, "reward_std": 0.1843406707048416, "rewards/final_reward": 1.3733449605594634, "rewards/mask_iou_reward": 0.6866724802797317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1571004986763, "rewards/thk_ans_format_reward": 1.0, "step": 1602, "think_completion_length": 42.90625 }, { "clip_ratio": 0.0, "completion_length": 113.5625, "epoch": 2.706576728499157, "grad_norm": 6.757045349435166, "kl": 0.58203125, "learning_rate": 4.593591905564924e-07, "loss": 0.0006, "reward": 3.094280958175659, "reward_std": 0.06427431292831898, "rewards/final_reward": 0.8938852405476458, "rewards/mask_iou_reward": 0.4469426202738229, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0942809730768204, "rewards/thk_ans_format_reward": 1.0, "step": 1603, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 110.296875, "epoch": 2.7082630691399663, "grad_norm": 6.581458185650574, "kl": 0.560546875, "learning_rate": 4.590219224283305e-07, "loss": 0.0006, "reward": 3.599763035774231, "reward_std": 0.09732984844595194, "rewards/final_reward": 1.4477912477637431, "rewards/mask_iou_reward": 0.7238956238818716, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5997629761695862, "rewards/thk_ans_format_reward": 1.0, "step": 1604, "think_completion_length": 38.5625 }, { "clip_ratio": 0.0, "completion_length": 159.890625, "epoch": 2.7099494097807755, "grad_norm": 8.227960755990702, "kl": 0.48828125, "learning_rate": 4.586846543001686e-07, "loss": 0.0005, "reward": 3.294048309326172, "reward_std": 0.19548258185386658, "rewards/final_reward": 1.0749916780010045, "rewards/mask_iou_reward": 0.5374958390005022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.294048249721527, "rewards/thk_ans_format_reward": 1.0, "step": 1605, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 113.65625, "epoch": 2.711635750421585, "grad_norm": 13.268237610277964, "kl": 0.4794921875, "learning_rate": 4.5834738617200673e-07, "loss": 0.0005, "reward": 3.5636035203933716, "reward_std": 0.07341088191606104, "rewards/final_reward": 1.6283072965909686, "rewards/mask_iou_reward": 0.8141536482954843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.563603401184082, "rewards/thk_ans_format_reward": 1.0, "step": 1606, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 126.171875, "epoch": 2.7133220910623947, "grad_norm": 11.23746727458345, "kl": 0.509765625, "learning_rate": 4.580101180438448e-07, "loss": 0.0005, "reward": 2.89646053314209, "reward_std": 0.2256901040673256, "rewards/final_reward": 1.0191730065477667, "rewards/mask_iou_reward": 0.5095865032738833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8964604437351227, "rewards/thk_ans_format_reward": 1.0, "step": 1607, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 119.734375, "epoch": 2.715008431703204, "grad_norm": 11.332571852572475, "kl": 0.671875, "learning_rate": 4.5767284991568296e-07, "loss": 0.0007, "reward": 2.9931583404541016, "reward_std": 0.16069792211055756, "rewards/final_reward": 0.6567371198659422, "rewards/mask_iou_reward": 0.3283685599329711, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9931585192680359, "rewards/thk_ans_format_reward": 1.0, "step": 1608, "think_completion_length": 46.90625 }, { "clip_ratio": 0.0, "completion_length": 113.78125, "epoch": 2.7166947723440136, "grad_norm": 20.350092315088784, "kl": 0.6484375, "learning_rate": 4.5733558178752105e-07, "loss": 0.0006, "reward": 3.3594272136688232, "reward_std": 0.049404000863432884, "rewards/final_reward": 1.0703260453871735, "rewards/mask_iou_reward": 0.5351630226935867, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3594273328781128, "rewards/thk_ans_format_reward": 1.0, "step": 1609, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 114.359375, "epoch": 2.718381112984823, "grad_norm": 11.614624529345251, "kl": 0.541015625, "learning_rate": 4.5699831365935914e-07, "loss": 0.0005, "reward": 3.3027396202087402, "reward_std": 0.12427800334990025, "rewards/final_reward": 1.7234073373638734, "rewards/mask_iou_reward": 0.8617036686819367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3027397394180298, "rewards/thk_ans_format_reward": 1.0, "step": 1610, "think_completion_length": 45.875 }, { "clip_ratio": 0.0, "completion_length": 121.546875, "epoch": 2.7200674536256324, "grad_norm": 6.353380378963483, "kl": 0.505859375, "learning_rate": 4.566610455311973e-07, "loss": 0.0005, "reward": 3.120621681213379, "reward_std": 0.2408496029675007, "rewards/final_reward": 1.1250674417261752, "rewards/mask_iou_reward": 0.5625337208630876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.12062169611454, "rewards/thk_ans_format_reward": 1.0, "step": 1611, "think_completion_length": 51.4375 }, { "clip_ratio": 0.0, "completion_length": 113.203125, "epoch": 2.7217537942664416, "grad_norm": 4.3232466439197115, "kl": 0.5234375, "learning_rate": 4.563237774030354e-07, "loss": 0.0005, "reward": 2.8183436393737793, "reward_std": 0.019991028821095824, "rewards/final_reward": 0.6046746524383444, "rewards/mask_iou_reward": 0.3023373262191722, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8183438181877136, "rewards/thk_ans_format_reward": 1.0, "step": 1612, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 122.046875, "epoch": 2.7234401349072512, "grad_norm": 6.560569368733555, "kl": 0.466796875, "learning_rate": 4.5598650927487345e-07, "loss": 0.0005, "reward": 3.509775757789612, "reward_std": 0.020152635872364044, "rewards/final_reward": 1.8637907902287016, "rewards/mask_iou_reward": 0.9318953951143508, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5097758769989014, "rewards/thk_ans_format_reward": 1.0, "step": 1613, "think_completion_length": 37.3125 }, { "clip_ratio": 0.0, "completion_length": 120.9375, "epoch": 2.725126475548061, "grad_norm": 29.48749989127377, "kl": 0.4638671875, "learning_rate": 4.556492411467116e-07, "loss": 0.0005, "reward": 3.187331199645996, "reward_std": 0.3615667298436165, "rewards/final_reward": 0.7844147741617697, "rewards/mask_iou_reward": 0.3922073870808849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1873311400413513, "rewards/thk_ans_format_reward": 1.0, "step": 1614, "think_completion_length": 54.0 }, { "clip_ratio": 0.0, "completion_length": 109.359375, "epoch": 2.72681281618887, "grad_norm": 7.455100685242957, "kl": 0.595703125, "learning_rate": 4.5531197301854973e-07, "loss": 0.0006, "reward": 3.0750458240509033, "reward_std": 0.1224330198019743, "rewards/final_reward": 0.8567996114915193, "rewards/mask_iou_reward": 0.42839980574575964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0750460028648376, "rewards/thk_ans_format_reward": 1.0, "step": 1615, "think_completion_length": 39.1875 }, { "clip_ratio": 0.0, "completion_length": 113.140625, "epoch": 2.7284991568296797, "grad_norm": 617.4205143388256, "kl": 1.263671875, "learning_rate": 4.5497470489038787e-07, "loss": 0.0013, "reward": 3.344548225402832, "reward_std": 0.13941496424376965, "rewards/final_reward": 1.4527047944685108, "rewards/mask_iou_reward": 0.7263523972342554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3445482850074768, "rewards/thk_ans_format_reward": 1.0, "step": 1616, "think_completion_length": 39.75 }, { "clip_ratio": 0.0, "completion_length": 144.28125, "epoch": 2.730185497470489, "grad_norm": 5.885694035409707, "kl": 0.53125, "learning_rate": 4.5463743676222596e-07, "loss": 0.0005, "reward": 3.196670651435852, "reward_std": 0.0907645896077156, "rewards/final_reward": 1.4354340157265555, "rewards/mask_iou_reward": 0.7177170078632777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1966705918312073, "rewards/thk_ans_format_reward": 1.0, "step": 1617, "think_completion_length": 47.25 }, { "clip_ratio": 0.0, "completion_length": 116.9375, "epoch": 2.7318718381112985, "grad_norm": 11.887463663477943, "kl": 1.025390625, "learning_rate": 4.5430016863406405e-07, "loss": 0.001, "reward": 3.2308313846588135, "reward_std": 0.17940062656998634, "rewards/final_reward": 1.219334363125589, "rewards/mask_iou_reward": 0.6096671815627945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.230831354856491, "rewards/thk_ans_format_reward": 1.0, "step": 1618, "think_completion_length": 48.6875 }, { "clip_ratio": 0.0, "completion_length": 115.15625, "epoch": 2.7335581787521077, "grad_norm": 5.53371551554527, "kl": 0.58203125, "learning_rate": 4.539629005059022e-07, "loss": 0.0006, "reward": 3.69223952293396, "reward_std": 0.06898763962090015, "rewards/final_reward": 1.6759169513149303, "rewards/mask_iou_reward": 0.8379584756574652, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.69223952293396, "rewards/thk_ans_format_reward": 1.0, "step": 1619, "think_completion_length": 43.28125 }, { "clip_ratio": 0.0, "completion_length": 120.1875, "epoch": 2.7352445193929174, "grad_norm": 18.416952301844045, "kl": 0.517578125, "learning_rate": 4.536256323777403e-07, "loss": 0.0005, "reward": 3.3278738260269165, "reward_std": 0.1171044334769249, "rewards/final_reward": 0.9361255570934764, "rewards/mask_iou_reward": 0.4680627785467382, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3278738260269165, "rewards/thk_ans_format_reward": 1.0, "step": 1620, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 117.1875, "epoch": 2.736930860033727, "grad_norm": 10.515013413730191, "kl": 0.5546875, "learning_rate": 4.532883642495784e-07, "loss": 0.0006, "reward": 3.530287265777588, "reward_std": 0.21374469250440598, "rewards/final_reward": 1.4084952224038, "rewards/mask_iou_reward": 0.7042476112019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5302872061729431, "rewards/thk_ans_format_reward": 1.0, "step": 1621, "think_completion_length": 45.71875 }, { "clip_ratio": 0.0, "completion_length": 113.359375, "epoch": 2.738617200674536, "grad_norm": 10.898962457623472, "kl": 0.537109375, "learning_rate": 4.529510961214165e-07, "loss": 0.0005, "reward": 3.6671559810638428, "reward_std": 0.06463497970253229, "rewards/final_reward": 1.7504511102408151, "rewards/mask_iou_reward": 0.8752255551204076, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6671560406684875, "rewards/thk_ans_format_reward": 1.0, "step": 1622, "think_completion_length": 40.65625 }, { "clip_ratio": 0.0, "completion_length": 131.34375, "epoch": 2.740303541315346, "grad_norm": 8.048926304189164, "kl": 0.5205078125, "learning_rate": 4.526138279932546e-07, "loss": 0.0005, "reward": 3.449007511138916, "reward_std": 0.1426835972815752, "rewards/final_reward": 1.6894874620838656, "rewards/mask_iou_reward": 0.8447437310419328, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.449007511138916, "rewards/thk_ans_format_reward": 1.0, "step": 1623, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 112.046875, "epoch": 2.741989881956155, "grad_norm": 6.472851906577908, "kl": 0.564453125, "learning_rate": 4.5227655986509273e-07, "loss": 0.0006, "reward": 3.6451568603515625, "reward_std": 0.2593112513422966, "rewards/final_reward": 1.4224126006269602, "rewards/mask_iou_reward": 0.7112063003134801, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.660781741142273, "rewards/thk_ans_format_reward": 1.0, "step": 1624, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 126.140625, "epoch": 2.7436762225969646, "grad_norm": 10.19352056567603, "kl": 0.51953125, "learning_rate": 4.5193929173693087e-07, "loss": 0.0005, "reward": 3.6311769485473633, "reward_std": 0.016571541782468557, "rewards/final_reward": 1.8216684584209872, "rewards/mask_iou_reward": 0.9108342292104936, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6311771273612976, "rewards/thk_ans_format_reward": 1.0, "step": 1625, "think_completion_length": 43.4375 }, { "clip_ratio": 0.0, "completion_length": 154.890625, "epoch": 2.745362563237774, "grad_norm": 7.9582142441158545, "kl": 0.5234375, "learning_rate": 4.5160202360876896e-07, "loss": 0.0005, "reward": 3.2092288732528687, "reward_std": 0.1798749640583992, "rewards/final_reward": 1.476282555361136, "rewards/mask_iou_reward": 0.738141277680568, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2092288732528687, "rewards/thk_ans_format_reward": 1.0, "step": 1626, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 112.203125, "epoch": 2.7470489038785835, "grad_norm": 10.413865750913033, "kl": 0.564453125, "learning_rate": 4.5126475548060705e-07, "loss": 0.0006, "reward": 3.249502658843994, "reward_std": 0.116750568151474, "rewards/final_reward": 1.4793633630563305, "rewards/mask_iou_reward": 0.7396816815281653, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2495025396347046, "rewards/thk_ans_format_reward": 1.0, "step": 1627, "think_completion_length": 41.21875 }, { "clip_ratio": 0.0, "completion_length": 115.78125, "epoch": 2.748735244519393, "grad_norm": 14.680895975342848, "kl": 0.509765625, "learning_rate": 4.509274873524452e-07, "loss": 0.0005, "reward": 3.1441562175750732, "reward_std": 0.0737368743866682, "rewards/final_reward": 1.3852608388558878, "rewards/mask_iou_reward": 0.6926304194279439, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1441562175750732, "rewards/thk_ans_format_reward": 1.0, "step": 1628, "think_completion_length": 38.5 }, { "clip_ratio": 0.0, "completion_length": 107.296875, "epoch": 2.7504215851602023, "grad_norm": 9.618197850292306, "kl": 0.625, "learning_rate": 4.5059021922428333e-07, "loss": 0.0006, "reward": 3.2445462942123413, "reward_std": 0.2222440093755722, "rewards/final_reward": 1.501265198198892, "rewards/mask_iou_reward": 0.750632599099446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2445462942123413, "rewards/thk_ans_format_reward": 1.0, "step": 1629, "think_completion_length": 36.46875 }, { "clip_ratio": 0.0, "completion_length": 114.71875, "epoch": 2.752107925801012, "grad_norm": 8.92522300504253, "kl": 0.56640625, "learning_rate": 4.5025295109612136e-07, "loss": 0.0006, "reward": 3.412382483482361, "reward_std": 0.32697246968746185, "rewards/final_reward": 1.2967872210805327, "rewards/mask_iou_reward": 0.6483936105402663, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4280074834823608, "rewards/thk_ans_format_reward": 1.0, "step": 1630, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 112.578125, "epoch": 2.753794266441821, "grad_norm": 9.728732780329596, "kl": 0.541015625, "learning_rate": 4.499156829679595e-07, "loss": 0.0004, "reward": 3.712798237800598, "reward_std": 0.14024843752849847, "rewards/final_reward": 1.5205530420294848, "rewards/mask_iou_reward": 0.7602765210147424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7127981185913086, "rewards/thk_ans_format_reward": 1.0, "step": 1631, "think_completion_length": 45.21875 }, { "clip_ratio": 0.0, "completion_length": 112.484375, "epoch": 2.7554806070826308, "grad_norm": 55.711069059492566, "kl": 0.548828125, "learning_rate": 4.4957841483979764e-07, "loss": 0.0005, "reward": 3.454300045967102, "reward_std": 0.052289645187556744, "rewards/final_reward": 1.872316458665721, "rewards/mask_iou_reward": 0.9361582293328605, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4542999267578125, "rewards/thk_ans_format_reward": 1.0, "step": 1632, "think_completion_length": 45.375 }, { "clip_ratio": 0.0, "completion_length": 112.703125, "epoch": 2.75716694772344, "grad_norm": 12.41641458988822, "kl": 0.572265625, "learning_rate": 4.4924114671163573e-07, "loss": 0.0006, "reward": 3.2182466983795166, "reward_std": 0.08807303197681904, "rewards/final_reward": 0.8138171199638627, "rewards/mask_iou_reward": 0.4069085599819314, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2182466089725494, "rewards/thk_ans_format_reward": 1.0, "step": 1633, "think_completion_length": 39.34375 }, { "clip_ratio": 0.0, "completion_length": 117.21875, "epoch": 2.7588532883642496, "grad_norm": 5.889717675375059, "kl": 0.552734375, "learning_rate": 4.489038785834738e-07, "loss": 0.0006, "reward": 2.974811315536499, "reward_std": 0.06643060594797134, "rewards/final_reward": 1.0631971480457183, "rewards/mask_iou_reward": 0.5315985740228591, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9748111963272095, "rewards/thk_ans_format_reward": 1.0, "step": 1634, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 114.203125, "epoch": 2.7605396290050592, "grad_norm": 14.3730869784043, "kl": 0.513671875, "learning_rate": 4.4856661045531196e-07, "loss": 0.0005, "reward": 3.477365493774414, "reward_std": 0.3136833906173706, "rewards/final_reward": 1.4492890341658577, "rewards/mask_iou_reward": 0.7246445170829289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.477365493774414, "rewards/thk_ans_format_reward": 1.0, "step": 1635, "think_completion_length": 48.0625 }, { "clip_ratio": 0.0, "completion_length": 122.0, "epoch": 2.7622259696458684, "grad_norm": 9.270564936349368, "kl": 0.564453125, "learning_rate": 4.4822934232715004e-07, "loss": 0.0006, "reward": 2.9075610637664795, "reward_std": 0.13323557563126087, "rewards/final_reward": 1.5394845428329393, "rewards/mask_iou_reward": 0.7697422714164697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9075611233711243, "rewards/thk_ans_format_reward": 1.0, "step": 1636, "think_completion_length": 48.90625 }, { "clip_ratio": 0.0, "completion_length": 115.859375, "epoch": 2.763912310286678, "grad_norm": 9.59060155091405, "kl": 0.55859375, "learning_rate": 4.478920741989882e-07, "loss": 0.0006, "reward": 3.169069766998291, "reward_std": 0.24465776793658733, "rewards/final_reward": 1.1316411755875047, "rewards/mask_iou_reward": 0.5658205877937523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.169069766998291, "rewards/thk_ans_format_reward": 1.0, "step": 1637, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 128.453125, "epoch": 2.7655986509274872, "grad_norm": 7.833469504528331, "kl": 0.486328125, "learning_rate": 4.4755480607082627e-07, "loss": 0.0005, "reward": 3.5080454349517822, "reward_std": 0.044747334672138095, "rewards/final_reward": 1.8708290131033436, "rewards/mask_iou_reward": 0.9354145065516718, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5080453753471375, "rewards/thk_ans_format_reward": 1.0, "step": 1638, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 114.21875, "epoch": 2.767284991568297, "grad_norm": 20.33564956478746, "kl": 0.7431640625, "learning_rate": 4.472175379426644e-07, "loss": 0.0007, "reward": 3.184965491294861, "reward_std": 0.13730136305093765, "rewards/final_reward": 1.3746444918404204, "rewards/mask_iou_reward": 0.6873222459202102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.184965431690216, "rewards/thk_ans_format_reward": 1.0, "step": 1639, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 130.46875, "epoch": 2.768971332209106, "grad_norm": 9.139679927371628, "kl": 0.537109375, "learning_rate": 4.468802698145025e-07, "loss": 0.0005, "reward": 3.26485538482666, "reward_std": 0.2971716374158859, "rewards/final_reward": 0.9064043195540362, "rewards/mask_iou_reward": 0.4532021597770181, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2804805040359497, "rewards/thk_ans_format_reward": 1.0, "step": 1640, "think_completion_length": 43.8125 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 2.7706576728499157, "grad_norm": 28.423688155899523, "kl": 0.58203125, "learning_rate": 4.4654300168634064e-07, "loss": 0.0006, "reward": 3.2505204677581787, "reward_std": 0.08370211534202099, "rewards/final_reward": 0.9374799320219221, "rewards/mask_iou_reward": 0.4687399660109611, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2505203485488892, "rewards/thk_ans_format_reward": 1.0, "step": 1641, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 113.09375, "epoch": 2.7723440134907253, "grad_norm": 48.70271629740253, "kl": 0.568359375, "learning_rate": 4.462057335581788e-07, "loss": 0.0006, "reward": 3.169440746307373, "reward_std": 0.0842177951708436, "rewards/final_reward": 1.3710371271332886, "rewards/mask_iou_reward": 0.6855185635666443, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1694406569004059, "rewards/thk_ans_format_reward": 1.0, "step": 1642, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 115.734375, "epoch": 2.7740303541315345, "grad_norm": 7.366441989581299, "kl": 0.5625, "learning_rate": 4.458684654300168e-07, "loss": 0.0006, "reward": 3.6222939491271973, "reward_std": 0.03713347762823105, "rewards/final_reward": 1.5904692873556985, "rewards/mask_iou_reward": 0.7952346436778492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6222938299179077, "rewards/thk_ans_format_reward": 1.0, "step": 1643, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 118.25, "epoch": 2.775716694772344, "grad_norm": 9.447110316239264, "kl": 0.626953125, "learning_rate": 4.4553119730185496e-07, "loss": 0.0006, "reward": 3.4532470703125, "reward_std": 0.20233439654111862, "rewards/final_reward": 1.3015430654140039, "rewards/mask_iou_reward": 0.6507715327070019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4532470703125, "rewards/thk_ans_format_reward": 1.0, "step": 1644, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 123.703125, "epoch": 2.7774030354131534, "grad_norm": 13.954279040230144, "kl": 0.4765625, "learning_rate": 4.451939291736931e-07, "loss": 0.0005, "reward": 3.377991557121277, "reward_std": 0.18850401416420937, "rewards/final_reward": 1.3938890227534597, "rewards/mask_iou_reward": 0.6969445113767299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3779913783073425, "rewards/thk_ans_format_reward": 1.0, "step": 1645, "think_completion_length": 44.90625 }, { "clip_ratio": 0.0, "completion_length": 100.015625, "epoch": 2.779089376053963, "grad_norm": 22.690826871154588, "kl": 0.548828125, "learning_rate": 4.4485666104553113e-07, "loss": 0.0005, "reward": 3.6439974308013916, "reward_std": 0.18691938370466232, "rewards/final_reward": 1.3127684972547642, "rewards/mask_iou_reward": 0.6563842486273821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6439975500106812, "rewards/thk_ans_format_reward": 1.0, "step": 1646, "think_completion_length": 43.9375 }, { "clip_ratio": 0.0, "completion_length": 126.4375, "epoch": 2.780775716694772, "grad_norm": 5.949871279661459, "kl": 0.5390625, "learning_rate": 4.4451939291736927e-07, "loss": 0.0005, "reward": 3.5799875259399414, "reward_std": 0.06551541201770306, "rewards/final_reward": 1.5637152935744738, "rewards/mask_iou_reward": 0.7818576467872369, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.579987347126007, "rewards/thk_ans_format_reward": 1.0, "step": 1647, "think_completion_length": 48.3125 }, { "clip_ratio": 0.0, "completion_length": 122.0625, "epoch": 2.782462057335582, "grad_norm": 7.5002904571187665, "kl": 0.60546875, "learning_rate": 4.441821247892074e-07, "loss": 0.0006, "reward": 3.5384901762008667, "reward_std": 0.10225693881511688, "rewards/final_reward": 1.243734327595926, "rewards/mask_iou_reward": 0.621867163797963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.538490116596222, "rewards/thk_ans_format_reward": 1.0, "step": 1648, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 116.515625, "epoch": 2.7841483979763915, "grad_norm": 11.423476860723076, "kl": 0.505859375, "learning_rate": 4.438448566610455e-07, "loss": 0.0005, "reward": 3.378657102584839, "reward_std": 0.011581235099583864, "rewards/final_reward": 1.1859685848692345, "rewards/mask_iou_reward": 0.5929842924346173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.378657042980194, "rewards/thk_ans_format_reward": 1.0, "step": 1649, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 140.265625, "epoch": 2.7858347386172007, "grad_norm": 7.507930542476445, "kl": 0.525390625, "learning_rate": 4.4350758853288364e-07, "loss": 0.0009, "reward": 3.132522940635681, "reward_std": 0.04552896483801305, "rewards/final_reward": 0.7150072786373457, "rewards/mask_iou_reward": 0.35750363931867285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1325227916240692, "rewards/thk_ans_format_reward": 1.0, "step": 1650, "think_completion_length": 52.34375 }, { "clip_ratio": 0.0, "completion_length": 114.90625, "epoch": 2.78752107925801, "grad_norm": 8.212728271467169, "kl": 0.546875, "learning_rate": 4.431703204047217e-07, "loss": 0.0005, "reward": 3.3425687551498413, "reward_std": 0.1770862564444542, "rewards/final_reward": 1.2308162768632087, "rewards/mask_iou_reward": 0.6154081384316044, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3425687551498413, "rewards/thk_ans_format_reward": 1.0, "step": 1651, "think_completion_length": 43.0625 }, { "clip_ratio": 0.0, "completion_length": 135.265625, "epoch": 2.7892074198988195, "grad_norm": 10.726955125370036, "kl": 0.46875, "learning_rate": 4.4283305227655987e-07, "loss": 0.0005, "reward": 3.3672693967819214, "reward_std": 0.09276960045099258, "rewards/final_reward": 1.5693248195881653, "rewards/mask_iou_reward": 0.7846624097940826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3672692775726318, "rewards/thk_ans_format_reward": 1.0, "step": 1652, "think_completion_length": 52.65625 }, { "clip_ratio": 0.0, "completion_length": 130.78125, "epoch": 2.790893760539629, "grad_norm": 20.126983932706253, "kl": 0.5302734375, "learning_rate": 4.4249578414839795e-07, "loss": 0.0005, "reward": 2.6683385372161865, "reward_std": 0.08216170221567154, "rewards/final_reward": 0.280023642755239, "rewards/mask_iou_reward": 0.1400118213776195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6683385670185089, "rewards/thk_ans_format_reward": 1.0, "step": 1653, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 163.390625, "epoch": 2.7925801011804383, "grad_norm": 9.48556505248788, "kl": 0.46484375, "learning_rate": 4.421585160202361e-07, "loss": 0.0005, "reward": 3.599041700363159, "reward_std": 0.22133435308933258, "rewards/final_reward": 1.5422429531382462, "rewards/mask_iou_reward": 0.7711214765691231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5990417003631592, "rewards/thk_ans_format_reward": 1.0, "step": 1654, "think_completion_length": 48.59375 }, { "clip_ratio": 0.0, "completion_length": 120.84375, "epoch": 2.794266441821248, "grad_norm": 8.033105157926789, "kl": 0.546875, "learning_rate": 4.418212478920742e-07, "loss": 0.0005, "reward": 3.0550079345703125, "reward_std": 0.19160734862089157, "rewards/final_reward": 0.37046362232971186, "rewards/mask_iou_reward": 0.18523181116485593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0550077557563782, "rewards/thk_ans_format_reward": 1.0, "step": 1655, "think_completion_length": 53.375 }, { "clip_ratio": 0.0, "completion_length": 184.90625, "epoch": 2.7959527824620576, "grad_norm": 5.009021405486533, "kl": 0.548828125, "learning_rate": 4.4148397976391227e-07, "loss": 0.0006, "reward": 3.5861432552337646, "reward_std": 0.08710538037121296, "rewards/final_reward": 1.5525587463652795, "rewards/mask_iou_reward": 0.7762793731826397, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5861433148384094, "rewards/thk_ans_format_reward": 1.0, "step": 1656, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 143.96875, "epoch": 2.7976391231028668, "grad_norm": 8.854949474945357, "kl": 0.5, "learning_rate": 4.411467116357504e-07, "loss": 0.0005, "reward": 3.54121994972229, "reward_std": 0.09857543557882309, "rewards/final_reward": 1.4392045951501446, "rewards/mask_iou_reward": 0.7196022975750723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5412198901176453, "rewards/thk_ans_format_reward": 1.0, "step": 1657, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 130.0625, "epoch": 2.799325463743676, "grad_norm": 12.518744855651851, "kl": 0.5625, "learning_rate": 4.4080944350758855e-07, "loss": 0.0006, "reward": 3.2827905416488647, "reward_std": 0.27441447228193283, "rewards/final_reward": 1.0464232900290225, "rewards/mask_iou_reward": 0.5232116450145112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2827905416488647, "rewards/thk_ans_format_reward": 1.0, "step": 1658, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 120.53125, "epoch": 2.8010118043844856, "grad_norm": 7.398113473881379, "kl": 0.568359375, "learning_rate": 4.404721753794266e-07, "loss": 0.0006, "reward": 3.1118181943893433, "reward_std": 0.09089295193552971, "rewards/final_reward": 1.056219472565322, "rewards/mask_iou_reward": 0.528109736282661, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1118182241916656, "rewards/thk_ans_format_reward": 1.0, "step": 1659, "think_completion_length": 51.65625 }, { "clip_ratio": 0.0, "completion_length": 114.953125, "epoch": 2.8026981450252952, "grad_norm": 4.592629735441871, "kl": 0.515625, "learning_rate": 4.401349072512647e-07, "loss": 0.0005, "reward": 3.5405365228652954, "reward_std": 0.058989531360566616, "rewards/final_reward": 1.421891593709226, "rewards/mask_iou_reward": 0.710945796854613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5405365824699402, "rewards/thk_ans_format_reward": 1.0, "step": 1660, "think_completion_length": 47.84375 }, { "clip_ratio": 0.0, "completion_length": 120.921875, "epoch": 2.8043844856661044, "grad_norm": 8.301478140894433, "kl": 0.56640625, "learning_rate": 4.3979763912310286e-07, "loss": 0.0006, "reward": 3.2338669300079346, "reward_std": 0.20402198284864426, "rewards/final_reward": 1.4756446384310582, "rewards/mask_iou_reward": 0.7378223192155291, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2338669896125793, "rewards/thk_ans_format_reward": 1.0, "step": 1661, "think_completion_length": 50.46875 }, { "clip_ratio": 0.0, "completion_length": 121.0, "epoch": 2.806070826306914, "grad_norm": 15.009090092095851, "kl": 0.517578125, "learning_rate": 4.3946037099494095e-07, "loss": 0.0005, "reward": 3.4566744565963745, "reward_std": 0.1600627675652504, "rewards/final_reward": 1.6646250918542926, "rewards/mask_iou_reward": 0.8323125459271463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4566745162010193, "rewards/thk_ans_format_reward": 1.0, "step": 1662, "think_completion_length": 48.78125 }, { "clip_ratio": 0.0, "completion_length": 114.125, "epoch": 2.8077571669477237, "grad_norm": 7.411943859075892, "kl": 0.537109375, "learning_rate": 4.3912310286677904e-07, "loss": 0.0005, "reward": 3.2928357124328613, "reward_std": 0.04496446065604687, "rewards/final_reward": 1.0907731326858414, "rewards/mask_iou_reward": 0.5453865663429207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.292835682630539, "rewards/thk_ans_format_reward": 1.0, "step": 1663, "think_completion_length": 45.4375 }, { "clip_ratio": 0.0, "completion_length": 152.0, "epoch": 2.809443507588533, "grad_norm": 5.522931760133354, "kl": 0.693359375, "learning_rate": 4.387858347386172e-07, "loss": 0.0007, "reward": 2.4156452417373657, "reward_std": 0.27752088755369186, "rewards/final_reward": 0.5370063073601792, "rewards/mask_iou_reward": 0.2685031536800896, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.41564518958330154, "rewards/thk_ans_format_reward": 1.0, "step": 1664, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 117.96875, "epoch": 2.811129848229342, "grad_norm": 12.48393400789474, "kl": 0.8046875, "learning_rate": 4.384485666104553e-07, "loss": 0.0008, "reward": 3.167549252510071, "reward_std": 0.1540435515344143, "rewards/final_reward": 1.2255837292741503, "rewards/mask_iou_reward": 0.6127918646370751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1675493121147156, "rewards/thk_ans_format_reward": 1.0, "step": 1665, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 120.484375, "epoch": 2.8128161888701517, "grad_norm": 7.842676773141203, "kl": 0.54296875, "learning_rate": 4.381112984822934e-07, "loss": 0.0003, "reward": 3.243940830230713, "reward_std": 0.0764221902936697, "rewards/final_reward": 1.1349538626751443, "rewards/mask_iou_reward": 0.5674769313375722, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.243940830230713, "rewards/thk_ans_format_reward": 1.0, "step": 1666, "think_completion_length": 52.3125 }, { "clip_ratio": 0.0, "completion_length": 116.265625, "epoch": 2.8145025295109614, "grad_norm": 7.420756090615668, "kl": 0.802734375, "learning_rate": 4.377740303541315e-07, "loss": 0.0008, "reward": 3.0553218126296997, "reward_std": 0.1333809532225132, "rewards/final_reward": 1.877014424743797, "rewards/mask_iou_reward": 0.9385072123718985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0553217232227325, "rewards/thk_ans_format_reward": 1.0, "step": 1667, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 123.921875, "epoch": 2.8161888701517706, "grad_norm": 20.9164669985359, "kl": 0.57421875, "learning_rate": 4.3743676222596963e-07, "loss": 0.0006, "reward": 3.2621958255767822, "reward_std": 0.10390813648700714, "rewards/final_reward": 1.5714543200341808, "rewards/mask_iou_reward": 0.7857271600170904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.262195885181427, "rewards/thk_ans_format_reward": 1.0, "step": 1668, "think_completion_length": 54.9375 }, { "clip_ratio": 0.0, "completion_length": 129.46875, "epoch": 2.81787521079258, "grad_norm": 11.46581781710695, "kl": 0.5087890625, "learning_rate": 4.370994940978077e-07, "loss": 0.0005, "reward": 3.0988789796829224, "reward_std": 0.16297003626823425, "rewards/final_reward": 0.3341480251757681, "rewards/mask_iou_reward": 0.16707401258788404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0988790392875671, "rewards/thk_ans_format_reward": 1.0, "step": 1669, "think_completion_length": 49.8125 }, { "clip_ratio": 0.0, "completion_length": 115.640625, "epoch": 2.8195615514333894, "grad_norm": 12.987290901235163, "kl": 0.533203125, "learning_rate": 4.3676222596964586e-07, "loss": 0.0006, "reward": 2.8739062547683716, "reward_std": 0.10603522881865501, "rewards/final_reward": 0.9522916842171223, "rewards/mask_iou_reward": 0.47614584210856115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8739061681553721, "rewards/thk_ans_format_reward": 1.0, "step": 1670, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 131.015625, "epoch": 2.821247892074199, "grad_norm": 6.2920481506061385, "kl": 0.48046875, "learning_rate": 4.3642495784148395e-07, "loss": 0.0005, "reward": 3.123382806777954, "reward_std": 0.05944320000708103, "rewards/final_reward": 1.2645310449635334, "rewards/mask_iou_reward": 0.6322655224817667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1233826875686646, "rewards/thk_ans_format_reward": 1.0, "step": 1671, "think_completion_length": 52.90625 }, { "clip_ratio": 0.0, "completion_length": 174.15625, "epoch": 2.822934232715008, "grad_norm": 4.193921693846766, "kl": 0.609375, "learning_rate": 4.3608768971332204e-07, "loss": 0.0006, "reward": 3.4360289573669434, "reward_std": 0.1628253385424614, "rewards/final_reward": 1.405196945674128, "rewards/mask_iou_reward": 0.702598472837064, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4360288381576538, "rewards/thk_ans_format_reward": 1.0, "step": 1672, "think_completion_length": 49.4375 }, { "clip_ratio": 0.0, "completion_length": 136.3125, "epoch": 2.824620573355818, "grad_norm": 13.421029881471101, "kl": 0.67578125, "learning_rate": 4.357504215851602e-07, "loss": 0.0007, "reward": 3.248274564743042, "reward_std": 0.026005716295912862, "rewards/final_reward": 1.0280415642752836, "rewards/mask_iou_reward": 0.5140207821376418, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2482746243476868, "rewards/thk_ans_format_reward": 1.0, "step": 1673, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 216.109375, "epoch": 2.8263069139966275, "grad_norm": 13.271196275319467, "kl": 0.484375, "learning_rate": 4.354131534569983e-07, "loss": 0.0005, "reward": 2.8315566778182983, "reward_std": 0.30540551617741585, "rewards/final_reward": 0.9391927953720351, "rewards/mask_iou_reward": 0.4695963976860176, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.894056499004364, "rewards/thk_ans_format_reward": 0.96875, "step": 1674, "think_completion_length": 47.0 }, { "clip_ratio": 0.0, "completion_length": 118.90625, "epoch": 2.8279932546374367, "grad_norm": 8.455936572709941, "kl": 0.552734375, "learning_rate": 4.3507588532883635e-07, "loss": 0.0006, "reward": 3.7103850841522217, "reward_std": 0.12102552060969174, "rewards/final_reward": 1.8495349146417959, "rewards/mask_iou_reward": 0.9247674573208979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.710385262966156, "rewards/thk_ans_format_reward": 1.0, "step": 1675, "think_completion_length": 49.59375 }, { "clip_ratio": 0.0, "completion_length": 164.390625, "epoch": 2.8296795952782463, "grad_norm": 23.85193166056514, "kl": 0.4775390625, "learning_rate": 4.347386172006745e-07, "loss": 0.0005, "reward": 3.2892091274261475, "reward_std": 0.2604014202952385, "rewards/final_reward": 1.5074901698516783, "rewards/mask_iou_reward": 0.7537450849258391, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2892090678215027, "rewards/thk_ans_format_reward": 1.0, "step": 1676, "think_completion_length": 47.78125 }, { "clip_ratio": 0.0, "completion_length": 118.25, "epoch": 2.8313659359190555, "grad_norm": 5.168884073555499, "kl": 0.56640625, "learning_rate": 4.3440134907251263e-07, "loss": 0.0006, "reward": 3.3190125226974487, "reward_std": 0.20287639647722244, "rewards/final_reward": 1.5806630389857197, "rewards/mask_iou_reward": 0.7903315194928598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3190125823020935, "rewards/thk_ans_format_reward": 1.0, "step": 1677, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 2.833052276559865, "grad_norm": 92.17630352695905, "kl": 0.576171875, "learning_rate": 4.3406408094435077e-07, "loss": 0.0006, "reward": 3.4132070541381836, "reward_std": 0.11548706330358982, "rewards/final_reward": 1.3225807783636303, "rewards/mask_iou_reward": 0.6612903891818152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4132071137428284, "rewards/thk_ans_format_reward": 1.0, "step": 1678, "think_completion_length": 51.5 }, { "clip_ratio": 0.0, "completion_length": 121.0625, "epoch": 2.8347386172006743, "grad_norm": 9.76328541975843, "kl": 0.537109375, "learning_rate": 4.3372681281618886e-07, "loss": 0.0005, "reward": 2.71964168548584, "reward_std": 0.3583778738975525, "rewards/final_reward": 0.7354924743235481, "rewards/mask_iou_reward": 0.36774623716177407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7196417450904846, "rewards/thk_ans_format_reward": 1.0, "step": 1679, "think_completion_length": 48.84375 }, { "clip_ratio": 0.0, "completion_length": 122.71875, "epoch": 2.836424957841484, "grad_norm": 18.884921118766233, "kl": 0.546875, "learning_rate": 4.3338954468802695e-07, "loss": 0.0005, "reward": 3.5863040685653687, "reward_std": 0.16526619624346495, "rewards/final_reward": 1.5597678148528689, "rewards/mask_iou_reward": 0.7798839074264344, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5863041877746582, "rewards/thk_ans_format_reward": 1.0, "step": 1680, "think_completion_length": 49.6875 }, { "clip_ratio": 0.0, "completion_length": 191.421875, "epoch": 2.8381112984822936, "grad_norm": 5.336722180196324, "kl": 0.466796875, "learning_rate": 4.330522765598651e-07, "loss": 0.0005, "reward": 3.214847207069397, "reward_std": 0.12801437883172184, "rewards/final_reward": 1.580831129811842, "rewards/mask_iou_reward": 0.790415564905921, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.214847207069397, "rewards/thk_ans_format_reward": 1.0, "step": 1681, "think_completion_length": 52.375 }, { "clip_ratio": 0.0, "completion_length": 137.328125, "epoch": 2.839797639123103, "grad_norm": 5.561161774358996, "kl": 0.474609375, "learning_rate": 4.327150084317032e-07, "loss": 0.0005, "reward": 3.457669496536255, "reward_std": 0.13381371274590492, "rewards/final_reward": 1.054481489107785, "rewards/mask_iou_reward": 0.5272407445538925, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4576694965362549, "rewards/thk_ans_format_reward": 1.0, "step": 1682, "think_completion_length": 51.65625 }, { "clip_ratio": 0.0, "completion_length": 121.75, "epoch": 2.8414839797639124, "grad_norm": 8.289504941062013, "kl": 0.525390625, "learning_rate": 4.323777403035413e-07, "loss": 0.0005, "reward": 3.3733558654785156, "reward_std": 0.2661462351679802, "rewards/final_reward": 1.5664154016541394, "rewards/mask_iou_reward": 0.7832077008270697, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.40460604429245, "rewards/thk_ans_format_reward": 1.0, "step": 1683, "think_completion_length": 49.375 }, { "clip_ratio": 0.0, "completion_length": 130.140625, "epoch": 2.8431703204047216, "grad_norm": 11.426186675709356, "kl": 0.482421875, "learning_rate": 4.320404721753794e-07, "loss": 0.0005, "reward": 3.723689556121826, "reward_std": 0.08870341628789902, "rewards/final_reward": 1.8753199432935024, "rewards/mask_iou_reward": 0.9376599716467512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7236894965171814, "rewards/thk_ans_format_reward": 1.0, "step": 1684, "think_completion_length": 52.71875 }, { "clip_ratio": 0.0, "completion_length": 125.125, "epoch": 2.8448566610455313, "grad_norm": 12.582730794114397, "kl": 0.5380859375, "learning_rate": 4.317032040472175e-07, "loss": 0.0005, "reward": 2.6829700469970703, "reward_std": 0.24802841991186142, "rewards/final_reward": 0.32948334050492945, "rewards/mask_iou_reward": 0.16474167025246472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6829699873924255, "rewards/thk_ans_format_reward": 1.0, "step": 1685, "think_completion_length": 47.71875 }, { "clip_ratio": 0.0, "completion_length": 130.078125, "epoch": 2.8465430016863404, "grad_norm": 12.427712787402632, "kl": 0.654296875, "learning_rate": 4.3136593591905563e-07, "loss": 0.0007, "reward": 3.4638320207595825, "reward_std": 0.2700415402650833, "rewards/final_reward": 1.4534286260477245, "rewards/mask_iou_reward": 0.7267143130238622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4638320207595825, "rewards/thk_ans_format_reward": 1.0, "step": 1686, "think_completion_length": 55.03125 }, { "clip_ratio": 0.0, "completion_length": 122.15625, "epoch": 2.84822934232715, "grad_norm": 11.38645102336239, "kl": 0.541015625, "learning_rate": 4.3102866779089377e-07, "loss": 0.0005, "reward": 3.382949709892273, "reward_std": 0.3745395615696907, "rewards/final_reward": 1.6297009222230234, "rewards/mask_iou_reward": 0.8148504611115117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.382949709892273, "rewards/thk_ans_format_reward": 1.0, "step": 1687, "think_completion_length": 54.15625 }, { "clip_ratio": 0.0, "completion_length": 119.234375, "epoch": 2.8499156829679597, "grad_norm": 16.14129074118289, "kl": 0.537109375, "learning_rate": 4.306913996627318e-07, "loss": 0.0005, "reward": 3.3319497108459473, "reward_std": 0.0831929137930274, "rewards/final_reward": 1.1821064456849155, "rewards/mask_iou_reward": 0.5910532228424578, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3319497108459473, "rewards/thk_ans_format_reward": 1.0, "step": 1688, "think_completion_length": 48.75 }, { "clip_ratio": 0.0, "completion_length": 124.625, "epoch": 2.851602023608769, "grad_norm": 103.81113268417846, "kl": 0.57421875, "learning_rate": 4.3035413153456995e-07, "loss": 0.0006, "reward": 3.1938982009887695, "reward_std": 0.09463486075401306, "rewards/final_reward": 0.8637187013740543, "rewards/mask_iou_reward": 0.43185935068702713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.19389808177948, "rewards/thk_ans_format_reward": 1.0, "step": 1689, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 121.734375, "epoch": 2.8532883642495785, "grad_norm": 18.93569565214868, "kl": 0.5244140625, "learning_rate": 4.300168634064081e-07, "loss": 0.0005, "reward": 2.992497682571411, "reward_std": 0.07934637367725372, "rewards/final_reward": 0.7805767257267343, "rewards/mask_iou_reward": 0.39028836286336716, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9924976080656052, "rewards/thk_ans_format_reward": 1.0, "step": 1690, "think_completion_length": 51.59375 }, { "clip_ratio": 0.0, "completion_length": 142.84375, "epoch": 2.8549747048903877, "grad_norm": 6.227094648425705, "kl": 0.5849609375, "learning_rate": 4.2967959527824623e-07, "loss": 0.0006, "reward": 3.5347208976745605, "reward_std": 0.02190372860059142, "rewards/final_reward": 1.175894394798423, "rewards/mask_iou_reward": 0.5879471973992115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.534720778465271, "rewards/thk_ans_format_reward": 1.0, "step": 1691, "think_completion_length": 46.15625 }, { "clip_ratio": 0.0, "completion_length": 116.796875, "epoch": 2.8566610455311974, "grad_norm": 15.977781215682915, "kl": 0.57421875, "learning_rate": 4.2934232715008426e-07, "loss": 0.0006, "reward": 3.471309781074524, "reward_std": 0.12639077939093113, "rewards/final_reward": 1.8368525076066613, "rewards/mask_iou_reward": 0.9184262538033306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.471309781074524, "rewards/thk_ans_format_reward": 1.0, "step": 1692, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 115.421875, "epoch": 2.8583473861720066, "grad_norm": 17.817364501759883, "kl": 1.21875, "learning_rate": 4.290050590219224e-07, "loss": 0.0012, "reward": 3.349412679672241, "reward_std": 0.02023978717625141, "rewards/final_reward": 1.6064038112818841, "rewards/mask_iou_reward": 0.8032019056409421, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3494127988815308, "rewards/thk_ans_format_reward": 1.0, "step": 1693, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 117.828125, "epoch": 2.860033726812816, "grad_norm": 11.180486783020493, "kl": 0.546875, "learning_rate": 4.2866779089376054e-07, "loss": 0.0005, "reward": 3.344439148902893, "reward_std": 0.05913896486163139, "rewards/final_reward": 1.2388363326863248, "rewards/mask_iou_reward": 0.6194181663431624, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3444392085075378, "rewards/thk_ans_format_reward": 1.0, "step": 1694, "think_completion_length": 46.1875 }, { "clip_ratio": 0.0, "completion_length": 118.796875, "epoch": 2.861720067453626, "grad_norm": 7.891941279533489, "kl": 0.5078125, "learning_rate": 4.2833052276559863e-07, "loss": 0.0005, "reward": 3.255765199661255, "reward_std": 0.1523425281047821, "rewards/final_reward": 1.1536836657640526, "rewards/mask_iou_reward": 0.5768418328820263, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2557652592658997, "rewards/thk_ans_format_reward": 1.0, "step": 1695, "think_completion_length": 50.0625 }, { "clip_ratio": 0.0, "completion_length": 119.34375, "epoch": 2.863406408094435, "grad_norm": 4.954447968205195, "kl": 0.697265625, "learning_rate": 4.279932546374367e-07, "loss": 0.0007, "reward": 3.497539758682251, "reward_std": 0.04229480121284723, "rewards/final_reward": 1.5405165870513375, "rewards/mask_iou_reward": 0.7702582935256688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4975398182868958, "rewards/thk_ans_format_reward": 1.0, "step": 1696, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 119.359375, "epoch": 2.8650927487352447, "grad_norm": 10.248081433569746, "kl": 0.529296875, "learning_rate": 4.2765598650927486e-07, "loss": 0.0005, "reward": 3.390744209289551, "reward_std": 0.09942889865487814, "rewards/final_reward": 0.9318678485841465, "rewards/mask_iou_reward": 0.46593392429207325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3907442688941956, "rewards/thk_ans_format_reward": 1.0, "step": 1697, "think_completion_length": 51.84375 }, { "clip_ratio": 0.0, "completion_length": 120.671875, "epoch": 2.866779089376054, "grad_norm": 19.360336118338246, "kl": 0.5625, "learning_rate": 4.2731871838111294e-07, "loss": 0.0006, "reward": 3.254174590110779, "reward_std": 0.34707289934158325, "rewards/final_reward": 0.8415739413701474, "rewards/mask_iou_reward": 0.4207869706850737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2541745901107788, "rewards/thk_ans_format_reward": 1.0, "step": 1698, "think_completion_length": 53.5625 }, { "clip_ratio": 0.0, "completion_length": 125.890625, "epoch": 2.8684654300168635, "grad_norm": 17.160014335485958, "kl": 1.837890625, "learning_rate": 4.269814502529511e-07, "loss": 0.0018, "reward": 3.398214817047119, "reward_std": 0.12190900649875402, "rewards/final_reward": 1.4416676597779143, "rewards/mask_iou_reward": 0.7208338298889572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3982148170471191, "rewards/thk_ans_format_reward": 1.0, "step": 1699, "think_completion_length": 46.78125 }, { "clip_ratio": 0.0, "completion_length": 188.765625, "epoch": 2.8701517706576727, "grad_norm": 5.664639260401403, "kl": 0.490234375, "learning_rate": 4.2664418212478917e-07, "loss": 0.0005, "reward": 3.334146022796631, "reward_std": 0.04577235411852598, "rewards/final_reward": 1.062847271016929, "rewards/mask_iou_reward": 0.5314236355084645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3341458439826965, "rewards/thk_ans_format_reward": 1.0, "step": 1700, "think_completion_length": 48.78125 }, { "clip_ratio": 0.0, "completion_length": 206.515625, "epoch": 2.8718381112984823, "grad_norm": 6.177565453121126, "kl": 0.4443359375, "learning_rate": 4.263069139966273e-07, "loss": 0.0004, "reward": 3.4637255668640137, "reward_std": 0.09016487468034029, "rewards/final_reward": 1.1509884523194014, "rewards/mask_iou_reward": 0.5754942261597007, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4637253284454346, "rewards/thk_ans_format_reward": 1.0, "step": 1701, "think_completion_length": 43.0 }, { "clip_ratio": 0.0, "completion_length": 172.796875, "epoch": 2.873524451939292, "grad_norm": 7.6573470902159775, "kl": 0.4873046875, "learning_rate": 4.259696458684654e-07, "loss": 0.0005, "reward": 3.3190892934799194, "reward_std": 0.1383841149508953, "rewards/final_reward": 0.7692882700656908, "rewards/mask_iou_reward": 0.3846441350328454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3190893530845642, "rewards/thk_ans_format_reward": 1.0, "step": 1702, "think_completion_length": 55.59375 }, { "clip_ratio": 0.0, "completion_length": 119.96875, "epoch": 2.875210792580101, "grad_norm": 16.530129336864004, "kl": 0.55859375, "learning_rate": 4.2563237774030354e-07, "loss": 0.0006, "reward": 3.119685411453247, "reward_std": 0.1856657639145851, "rewards/final_reward": 0.9465999923214892, "rewards/mask_iou_reward": 0.4732999961607446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1196853816509247, "rewards/thk_ans_format_reward": 1.0, "step": 1703, "think_completion_length": 48.6875 }, { "clip_ratio": 0.0, "completion_length": 128.609375, "epoch": 2.876897133220911, "grad_norm": 5.611754103632963, "kl": 0.587890625, "learning_rate": 4.252951096121417e-07, "loss": 0.0006, "reward": 3.011402726173401, "reward_std": 0.017416599672287703, "rewards/final_reward": 0.44855465136059125, "rewards/mask_iou_reward": 0.22427732568029562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0114028453826904, "rewards/thk_ans_format_reward": 1.0, "step": 1704, "think_completion_length": 52.15625 }, { "clip_ratio": 0.0, "completion_length": 118.40625, "epoch": 2.87858347386172, "grad_norm": 10.274447825876113, "kl": 0.708984375, "learning_rate": 4.249578414839797e-07, "loss": 0.0007, "reward": 3.3437150716781616, "reward_std": 0.1217598095536232, "rewards/final_reward": 1.7857162257774482, "rewards/mask_iou_reward": 0.8928581128887241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3437150716781616, "rewards/thk_ans_format_reward": 1.0, "step": 1705, "think_completion_length": 45.34375 }, { "clip_ratio": 0.0, "completion_length": 118.6875, "epoch": 2.8802698145025296, "grad_norm": 5.036390750197868, "kl": 0.564453125, "learning_rate": 4.2462057335581786e-07, "loss": 0.0006, "reward": 3.1259138584136963, "reward_std": 0.014334550127387047, "rewards/final_reward": 1.2241493610339746, "rewards/mask_iou_reward": 0.6120746805169873, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1259138584136963, "rewards/thk_ans_format_reward": 1.0, "step": 1706, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 122.4375, "epoch": 2.881956155143339, "grad_norm": 14.405343265668892, "kl": 0.505859375, "learning_rate": 4.24283305227656e-07, "loss": 0.0005, "reward": 3.0240617990493774, "reward_std": 0.11181307956576347, "rewards/final_reward": 1.4497086806740294, "rewards/mask_iou_reward": 0.7248543403370147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0240616202354431, "rewards/thk_ans_format_reward": 1.0, "step": 1707, "think_completion_length": 51.125 }, { "clip_ratio": 0.0, "completion_length": 155.125, "epoch": 2.8836424957841484, "grad_norm": 8.712780147567514, "kl": 0.537109375, "learning_rate": 4.239460370994941e-07, "loss": 0.0005, "reward": 3.6914472579956055, "reward_std": 0.13960205670446157, "rewards/final_reward": 1.730837492291958, "rewards/mask_iou_reward": 0.865418746145979, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6914473176002502, "rewards/thk_ans_format_reward": 1.0, "step": 1708, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 162.75, "epoch": 2.885328836424958, "grad_norm": 119.6328886128793, "kl": 0.6796875, "learning_rate": 4.2360876897133217e-07, "loss": 0.0007, "reward": 3.302128791809082, "reward_std": 0.04586852062493563, "rewards/final_reward": 0.9640497744462393, "rewards/mask_iou_reward": 0.48202488722311965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3021288514137268, "rewards/thk_ans_format_reward": 1.0, "step": 1709, "think_completion_length": 54.0625 }, { "clip_ratio": 0.0, "completion_length": 121.609375, "epoch": 2.8870151770657673, "grad_norm": 10.790821051390031, "kl": 0.642578125, "learning_rate": 4.232715008431703e-07, "loss": 0.0006, "reward": 3.2432212829589844, "reward_std": 0.25815831683576107, "rewards/final_reward": 1.198961451519287, "rewards/mask_iou_reward": 0.5994807257596435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2432212829589844, "rewards/thk_ans_format_reward": 1.0, "step": 1710, "think_completion_length": 56.46875 }, { "clip_ratio": 0.0, "completion_length": 176.765625, "epoch": 2.8887015177065765, "grad_norm": 8.464795399420126, "kl": 0.5009765625, "learning_rate": 4.229342327150084e-07, "loss": 0.0005, "reward": 3.3584113121032715, "reward_std": 0.09209583140909672, "rewards/final_reward": 1.1509888312868992, "rewards/mask_iou_reward": 0.5754944156434496, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3584113717079163, "rewards/thk_ans_format_reward": 1.0, "step": 1711, "think_completion_length": 47.65625 }, { "clip_ratio": 0.0, "completion_length": 140.078125, "epoch": 2.890387858347386, "grad_norm": 9.531788486775692, "kl": 0.501953125, "learning_rate": 4.2259696458684654e-07, "loss": 0.0005, "reward": 3.5532913208007812, "reward_std": 0.10277672484517097, "rewards/final_reward": 1.6036280466363646, "rewards/mask_iou_reward": 0.8018140233181823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5532912611961365, "rewards/thk_ans_format_reward": 1.0, "step": 1712, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 146.796875, "epoch": 2.8920741989881957, "grad_norm": 24.107209669435377, "kl": 0.4765625, "learning_rate": 4.222596964586846e-07, "loss": 0.0005, "reward": 3.5580878257751465, "reward_std": 0.05756748793646693, "rewards/final_reward": 1.4146137485463521, "rewards/mask_iou_reward": 0.7073068742731761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.558087944984436, "rewards/thk_ans_format_reward": 1.0, "step": 1713, "think_completion_length": 47.0 }, { "clip_ratio": 0.0, "completion_length": 171.09375, "epoch": 2.893760539629005, "grad_norm": 7.758736439924713, "kl": 0.533203125, "learning_rate": 4.2192242833052277e-07, "loss": 0.0005, "reward": 3.5002644062042236, "reward_std": 0.2005770057439804, "rewards/final_reward": 1.2092635447748536, "rewards/mask_iou_reward": 0.6046317723874268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5002645254135132, "rewards/thk_ans_format_reward": 1.0, "step": 1714, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 122.9375, "epoch": 2.8954468802698146, "grad_norm": 6.39981184493797, "kl": 0.658203125, "learning_rate": 4.2158516020236085e-07, "loss": 0.0007, "reward": 3.34832501411438, "reward_std": 0.07436983287334442, "rewards/final_reward": 1.7210072380912143, "rewards/mask_iou_reward": 0.8605036190456071, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3483251929283142, "rewards/thk_ans_format_reward": 1.0, "step": 1715, "think_completion_length": 44.40625 }, { "clip_ratio": 0.0, "completion_length": 117.65625, "epoch": 2.897133220910624, "grad_norm": 6.109057908741958, "kl": 0.521484375, "learning_rate": 4.21247892074199e-07, "loss": 0.0005, "reward": 3.4417446851730347, "reward_std": 0.007995732361450791, "rewards/final_reward": 0.9722025745398519, "rewards/mask_iou_reward": 0.48610128726992596, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4417446851730347, "rewards/thk_ans_format_reward": 1.0, "step": 1716, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 112.453125, "epoch": 2.8988195615514334, "grad_norm": 8.108779936322028, "kl": 0.693359375, "learning_rate": 4.209106239460371e-07, "loss": 0.0007, "reward": 3.0282063484191895, "reward_std": 0.29450612515211105, "rewards/final_reward": 0.9873438877122154, "rewards/mask_iou_reward": 0.4936719438561077, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0282064378261566, "rewards/thk_ans_format_reward": 1.0, "step": 1717, "think_completion_length": 51.25 }, { "clip_ratio": 0.0, "completion_length": 155.9375, "epoch": 2.9005059021922426, "grad_norm": 7.526311247628863, "kl": 0.5322265625, "learning_rate": 4.2057335581787517e-07, "loss": 0.0005, "reward": 3.740854859352112, "reward_std": 0.03472239035181701, "rewards/final_reward": 1.7201866005635802, "rewards/mask_iou_reward": 0.8600933002817901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7408548593521118, "rewards/thk_ans_format_reward": 1.0, "step": 1718, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 140.375, "epoch": 2.902192242833052, "grad_norm": 17.340575174288656, "kl": 0.560546875, "learning_rate": 4.202360876897133e-07, "loss": 0.0006, "reward": 3.252333164215088, "reward_std": 0.09733846783638, "rewards/final_reward": 1.877978983533691, "rewards/mask_iou_reward": 0.9389894917668455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.252333164215088, "rewards/thk_ans_format_reward": 1.0, "step": 1719, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 173.09375, "epoch": 2.903878583473862, "grad_norm": 11.916990798274322, "kl": 0.525390625, "learning_rate": 4.1989881956155145e-07, "loss": 0.0005, "reward": 3.2642595767974854, "reward_std": 0.3322697635740042, "rewards/final_reward": 1.6571951719452607, "rewards/mask_iou_reward": 0.8285975859726303, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.3580095767974854, "rewards/thk_ans_format_reward": 0.953125, "step": 1720, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 128.28125, "epoch": 2.905564924114671, "grad_norm": 60.51395728112135, "kl": 0.578125, "learning_rate": 4.195615514333895e-07, "loss": 0.0005, "reward": 3.8474960327148438, "reward_std": 0.09723218204453588, "rewards/final_reward": 1.8935047756335932, "rewards/mask_iou_reward": 0.9467523878167966, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8474960327148438, "rewards/thk_ans_format_reward": 1.0, "step": 1721, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 134.96875, "epoch": 2.9072512647554807, "grad_norm": 6.1129558108301865, "kl": 0.5703125, "learning_rate": 4.192242833052276e-07, "loss": 0.0005, "reward": 3.4524412155151367, "reward_std": 0.08695713616907597, "rewards/final_reward": 1.4480904777145365, "rewards/mask_iou_reward": 0.7240452388572682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.452441155910492, "rewards/thk_ans_format_reward": 1.0, "step": 1722, "think_completion_length": 47.1875 }, { "clip_ratio": 0.0, "completion_length": 118.625, "epoch": 2.9089376053962903, "grad_norm": 12.061966103728397, "kl": 0.56640625, "learning_rate": 4.1888701517706576e-07, "loss": 0.0006, "reward": 3.232837438583374, "reward_std": 0.16182934492826462, "rewards/final_reward": 1.3887647567204833, "rewards/mask_iou_reward": 0.6943823783602416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2328372597694397, "rewards/thk_ans_format_reward": 1.0, "step": 1723, "think_completion_length": 53.0 }, { "clip_ratio": 0.0, "completion_length": 115.84375, "epoch": 2.9106239460370995, "grad_norm": 9.271259487912298, "kl": 0.615234375, "learning_rate": 4.1854974704890385e-07, "loss": 0.0006, "reward": 3.0772303342819214, "reward_std": 0.19635407999157906, "rewards/final_reward": 1.4006259692580718, "rewards/mask_iou_reward": 0.7003129846290359, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0772303342819214, "rewards/thk_ans_format_reward": 1.0, "step": 1724, "think_completion_length": 46.1875 }, { "clip_ratio": 0.0, "completion_length": 136.78125, "epoch": 2.9123102866779087, "grad_norm": 14.68120288683905, "kl": 0.599609375, "learning_rate": 4.1821247892074194e-07, "loss": 0.0006, "reward": 3.557195544242859, "reward_std": 0.06366473622620106, "rewards/final_reward": 1.2397970681721528, "rewards/mask_iou_reward": 0.6198985340860764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5571956038475037, "rewards/thk_ans_format_reward": 1.0, "step": 1725, "think_completion_length": 48.4375 }, { "clip_ratio": 0.0, "completion_length": 121.1875, "epoch": 2.9139966273187183, "grad_norm": 13.195233955342133, "kl": 0.533203125, "learning_rate": 4.178752107925801e-07, "loss": 0.0005, "reward": 3.4791808128356934, "reward_std": 0.0421409523114562, "rewards/final_reward": 1.5218060859800824, "rewards/mask_iou_reward": 0.7609030429900412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.479180932044983, "rewards/thk_ans_format_reward": 1.0, "step": 1726, "think_completion_length": 52.53125 }, { "clip_ratio": 0.0, "completion_length": 118.171875, "epoch": 2.915682967959528, "grad_norm": 6.374675918777848, "kl": 0.56640625, "learning_rate": 4.175379426644182e-07, "loss": 0.0006, "reward": 3.0983314514160156, "reward_std": 0.128284377977252, "rewards/final_reward": 1.0925805377893214, "rewards/mask_iou_reward": 0.5462902688946607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0983315706253052, "rewards/thk_ans_format_reward": 1.0, "step": 1727, "think_completion_length": 50.53125 }, { "clip_ratio": 0.0, "completion_length": 112.40625, "epoch": 2.917369308600337, "grad_norm": 11.56261552116547, "kl": 0.55078125, "learning_rate": 4.172006745362563e-07, "loss": 0.0006, "reward": 3.524160861968994, "reward_std": 0.06520858220756054, "rewards/final_reward": 1.3922147334798918, "rewards/mask_iou_reward": 0.6961073667399459, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5241608619689941, "rewards/thk_ans_format_reward": 1.0, "step": 1728, "think_completion_length": 41.0 }, { "clip_ratio": 0.0, "completion_length": 113.421875, "epoch": 2.919055649241147, "grad_norm": 11.645755711489283, "kl": 0.64453125, "learning_rate": 4.168634064080944e-07, "loss": 0.0006, "reward": 3.8355716466903687, "reward_std": 0.004892201977781951, "rewards/final_reward": 1.9233353356845395, "rewards/mask_iou_reward": 0.9616676678422698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.835571825504303, "rewards/thk_ans_format_reward": 1.0, "step": 1729, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 118.375, "epoch": 2.920741989881956, "grad_norm": 11.776779803616577, "kl": 0.564453125, "learning_rate": 4.1652613827993254e-07, "loss": 0.0006, "reward": 3.778007984161377, "reward_std": 0.0795029029250145, "rewards/final_reward": 1.8544093301855198, "rewards/mask_iou_reward": 0.9272046650927599, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.778007984161377, "rewards/thk_ans_format_reward": 1.0, "step": 1730, "think_completion_length": 51.71875 }, { "clip_ratio": 0.0, "completion_length": 132.578125, "epoch": 2.9224283305227656, "grad_norm": 55.579732230614546, "kl": 0.6328125, "learning_rate": 4.161888701517706e-07, "loss": 0.0006, "reward": 3.334317922592163, "reward_std": 0.12129126489162445, "rewards/final_reward": 1.2962656919236346, "rewards/mask_iou_reward": 0.6481328459618173, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3343179821968079, "rewards/thk_ans_format_reward": 1.0, "step": 1731, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 120.234375, "epoch": 2.924114671163575, "grad_norm": 18.804690139625475, "kl": 0.66015625, "learning_rate": 4.1585160202360876e-07, "loss": 0.0007, "reward": 3.188036322593689, "reward_std": 0.13786154240369797, "rewards/final_reward": 1.4687809361757644, "rewards/mask_iou_reward": 0.7343904680878822, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.203661322593689, "rewards/thk_ans_format_reward": 1.0, "step": 1732, "think_completion_length": 51.15625 }, { "clip_ratio": 0.0, "completion_length": 122.1875, "epoch": 2.9258010118043845, "grad_norm": 18.20063333168867, "kl": 0.955078125, "learning_rate": 4.155143338954469e-07, "loss": 0.001, "reward": 3.518056869506836, "reward_std": 0.3514831140637398, "rewards/final_reward": 1.550659748466668, "rewards/mask_iou_reward": 0.775329874233334, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5180569291114807, "rewards/thk_ans_format_reward": 1.0, "step": 1733, "think_completion_length": 50.6875 }, { "clip_ratio": 0.0, "completion_length": 111.578125, "epoch": 2.927487352445194, "grad_norm": 39.32673591485834, "kl": 0.58203125, "learning_rate": 4.1517706576728494e-07, "loss": 0.0006, "reward": 3.3363709449768066, "reward_std": 0.06575199589133263, "rewards/final_reward": 1.2627733196875328, "rewards/mask_iou_reward": 0.6313866598437664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3363710045814514, "rewards/thk_ans_format_reward": 1.0, "step": 1734, "think_completion_length": 40.84375 }, { "clip_ratio": 0.0, "completion_length": 145.984375, "epoch": 2.9291736930860033, "grad_norm": 7.815025623286234, "kl": 0.708984375, "learning_rate": 4.148397976391231e-07, "loss": 0.0007, "reward": 3.329333782196045, "reward_std": 0.27078694477677345, "rewards/final_reward": 1.246577492295484, "rewards/mask_iou_reward": 0.623288746147742, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3605839014053345, "rewards/thk_ans_format_reward": 0.984375, "step": 1735, "think_completion_length": 41.71875 }, { "clip_ratio": 0.0, "completion_length": 125.59375, "epoch": 2.930860033726813, "grad_norm": 10.34660338793664, "kl": 0.576171875, "learning_rate": 4.145025295109612e-07, "loss": 0.0006, "reward": 3.636704921722412, "reward_std": 0.02508683316409588, "rewards/final_reward": 1.674860461087448, "rewards/mask_iou_reward": 0.837430230543724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.636704921722412, "rewards/thk_ans_format_reward": 1.0, "step": 1736, "think_completion_length": 47.15625 }, { "clip_ratio": 0.0, "completion_length": 112.921875, "epoch": 2.932546374367622, "grad_norm": 15.646912978699548, "kl": 0.71484375, "learning_rate": 4.141652613827993e-07, "loss": 0.0007, "reward": 3.115605115890503, "reward_std": 0.25399264693260193, "rewards/final_reward": 1.125185816813323, "rewards/mask_iou_reward": 0.5625929084066615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1156051754951477, "rewards/thk_ans_format_reward": 1.0, "step": 1737, "think_completion_length": 47.3125 }, { "clip_ratio": 0.0, "completion_length": 121.484375, "epoch": 2.9342327150084317, "grad_norm": 6.28798738746728, "kl": 0.578125, "learning_rate": 4.138279932546374e-07, "loss": 0.0006, "reward": 3.1532175540924072, "reward_std": 0.1299862286541611, "rewards/final_reward": 0.7318573581565188, "rewards/mask_iou_reward": 0.3659286790782594, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1532175540924072, "rewards/thk_ans_format_reward": 1.0, "step": 1738, "think_completion_length": 47.375 }, { "clip_ratio": 0.0, "completion_length": 108.296875, "epoch": 2.935919055649241, "grad_norm": 11.787626130142819, "kl": 0.630859375, "learning_rate": 4.1349072512647553e-07, "loss": 0.0006, "reward": 3.4893749952316284, "reward_std": 0.24706952273845673, "rewards/final_reward": 1.8051508827886227, "rewards/mask_iou_reward": 0.9025754413943113, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4893749952316284, "rewards/thk_ans_format_reward": 1.0, "step": 1739, "think_completion_length": 49.28125 }, { "clip_ratio": 0.0, "completion_length": 115.65625, "epoch": 2.9376053962900506, "grad_norm": 7.832585841665771, "kl": 0.72265625, "learning_rate": 4.131534569983137e-07, "loss": 0.0007, "reward": 3.078925609588623, "reward_std": 0.08913594763725996, "rewards/final_reward": 1.248415469949072, "rewards/mask_iou_reward": 0.624207734974536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0789256691932678, "rewards/thk_ans_format_reward": 1.0, "step": 1740, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 115.484375, "epoch": 2.93929173693086, "grad_norm": 16.07860759176254, "kl": 0.65625, "learning_rate": 4.1281618887015176e-07, "loss": 0.0007, "reward": 3.5517622232437134, "reward_std": 0.13729829341173172, "rewards/final_reward": 1.441280645538618, "rewards/mask_iou_reward": 0.720640322769309, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5517622232437134, "rewards/thk_ans_format_reward": 1.0, "step": 1741, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 113.015625, "epoch": 2.9409780775716694, "grad_norm": 7.257305752944733, "kl": 0.5380859375, "learning_rate": 4.1247892074198985e-07, "loss": 0.0005, "reward": 3.3132989406585693, "reward_std": 0.2520294189453125, "rewards/final_reward": 1.6513091533377802, "rewards/mask_iou_reward": 0.8256545766688901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3132989704608917, "rewards/thk_ans_format_reward": 1.0, "step": 1742, "think_completion_length": 41.625 }, { "clip_ratio": 0.0, "completion_length": 117.21875, "epoch": 2.942664418212479, "grad_norm": 7.221074397046372, "kl": 0.650390625, "learning_rate": 4.12141652613828e-07, "loss": 0.0006, "reward": 3.4602547883987427, "reward_std": 0.16891072317957878, "rewards/final_reward": 1.589578745008967, "rewards/mask_iou_reward": 0.7947893725044834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4602547883987427, "rewards/thk_ans_format_reward": 1.0, "step": 1743, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 116.0, "epoch": 2.9443507588532882, "grad_norm": 15.417697002872668, "kl": 0.572265625, "learning_rate": 4.118043844856661e-07, "loss": 0.0006, "reward": 3.5061349868774414, "reward_std": 0.15799922496080399, "rewards/final_reward": 1.4792684918120789, "rewards/mask_iou_reward": 0.7396342459060394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5061350464820862, "rewards/thk_ans_format_reward": 1.0, "step": 1744, "think_completion_length": 46.125 }, { "clip_ratio": 0.0, "completion_length": 122.625, "epoch": 2.946037099494098, "grad_norm": 5.94093777546409, "kl": 0.59375, "learning_rate": 4.114671163575042e-07, "loss": 0.0006, "reward": 3.624622106552124, "reward_std": 0.04693530406802893, "rewards/final_reward": 1.4988867171201967, "rewards/mask_iou_reward": 0.7494433585600984, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6246221661567688, "rewards/thk_ans_format_reward": 1.0, "step": 1745, "think_completion_length": 43.09375 }, { "clip_ratio": 0.0, "completion_length": 115.15625, "epoch": 2.947723440134907, "grad_norm": 7.926401098507347, "kl": 0.59765625, "learning_rate": 4.111298482293423e-07, "loss": 0.0006, "reward": 3.3583085536956787, "reward_std": 0.22104869782924652, "rewards/final_reward": 1.0619630684221244, "rewards/mask_iou_reward": 0.5309815342110622, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3583085536956787, "rewards/thk_ans_format_reward": 1.0, "step": 1746, "think_completion_length": 46.71875 }, { "clip_ratio": 0.0, "completion_length": 133.765625, "epoch": 2.9494097807757167, "grad_norm": 6.545742287407206, "kl": 0.5390625, "learning_rate": 4.107925801011804e-07, "loss": 0.0005, "reward": 3.228265881538391, "reward_std": 0.20059365965425968, "rewards/final_reward": 1.4879712694947818, "rewards/mask_iou_reward": 0.7439856347473909, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2282660007476807, "rewards/thk_ans_format_reward": 1.0, "step": 1747, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 205.828125, "epoch": 2.9510961214165263, "grad_norm": 10.569300190662831, "kl": 0.4287109375, "learning_rate": 4.1045531197301853e-07, "loss": 0.0003, "reward": 3.587049722671509, "reward_std": 0.06058724317699671, "rewards/final_reward": 1.2193715290183142, "rewards/mask_iou_reward": 0.6096857645091571, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5870497226715088, "rewards/thk_ans_format_reward": 1.0, "step": 1748, "think_completion_length": 47.75 }, { "clip_ratio": 0.0, "completion_length": 180.578125, "epoch": 2.9527824620573355, "grad_norm": 14.96704643685889, "kl": 0.478515625, "learning_rate": 4.1011804384485667e-07, "loss": 0.0005, "reward": 3.2427884340286255, "reward_std": 0.026081462390720844, "rewards/final_reward": 1.0459886978186481, "rewards/mask_iou_reward": 0.5229943489093241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.242788314819336, "rewards/thk_ans_format_reward": 1.0, "step": 1749, "think_completion_length": 47.65625 }, { "clip_ratio": 0.0, "completion_length": 132.484375, "epoch": 2.954468802698145, "grad_norm": 7.001821709693498, "kl": 0.55078125, "learning_rate": 4.097807757166947e-07, "loss": 0.0005, "reward": 3.2535473108291626, "reward_std": 0.057197438552975655, "rewards/final_reward": 1.3538752702792123, "rewards/mask_iou_reward": 0.6769376351396061, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.253547340631485, "rewards/thk_ans_format_reward": 1.0, "step": 1750, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 113.953125, "epoch": 2.9561551433389543, "grad_norm": 11.32756564935043, "kl": 0.55859375, "learning_rate": 4.0944350758853285e-07, "loss": 0.0006, "reward": 3.7669692039489746, "reward_std": 0.03139904234558344, "rewards/final_reward": 1.6746801459710268, "rewards/mask_iou_reward": 0.8373400729855134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7669692635536194, "rewards/thk_ans_format_reward": 1.0, "step": 1751, "think_completion_length": 42.78125 }, { "clip_ratio": 0.0, "completion_length": 139.671875, "epoch": 2.957841483979764, "grad_norm": 5.598772715073544, "kl": 0.587890625, "learning_rate": 4.09106239460371e-07, "loss": 0.0006, "reward": 3.5666593313217163, "reward_std": 0.09166042506694794, "rewards/final_reward": 1.5495096974513012, "rewards/mask_iou_reward": 0.7747548487256506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5666593313217163, "rewards/thk_ans_format_reward": 1.0, "step": 1752, "think_completion_length": 46.6875 }, { "clip_ratio": 0.0, "completion_length": 143.953125, "epoch": 2.959527824620573, "grad_norm": 13.584137909097144, "kl": 0.6875, "learning_rate": 4.0876897133220913e-07, "loss": 0.0007, "reward": 3.685564637184143, "reward_std": 0.043231220450252295, "rewards/final_reward": 1.5597897837836001, "rewards/mask_iou_reward": 0.7798948918918001, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.685564637184143, "rewards/thk_ans_format_reward": 1.0, "step": 1753, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 241.140625, "epoch": 2.961214165261383, "grad_norm": 22.40003494317226, "kl": 0.552734375, "learning_rate": 4.0843170320404716e-07, "loss": 0.0006, "reward": 3.3257750272750854, "reward_std": 0.2577382028102875, "rewards/final_reward": 1.2813706582980735, "rewards/mask_iou_reward": 0.6406853291490368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3257750272750854, "rewards/thk_ans_format_reward": 1.0, "step": 1754, "think_completion_length": 45.78125 }, { "clip_ratio": 0.0, "completion_length": 106.921875, "epoch": 2.9629005059021924, "grad_norm": 26.131581991547865, "kl": 0.673828125, "learning_rate": 4.080944350758853e-07, "loss": 0.0007, "reward": 3.4210046529769897, "reward_std": 0.24233996961265802, "rewards/final_reward": 1.1368643662064861, "rewards/mask_iou_reward": 0.5684321831032431, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4210045337677002, "rewards/thk_ans_format_reward": 1.0, "step": 1755, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 146.078125, "epoch": 2.9645868465430016, "grad_norm": 11.690378243373605, "kl": 0.546875, "learning_rate": 4.0775716694772344e-07, "loss": 0.0006, "reward": 3.2714394330978394, "reward_std": 0.43962132930755615, "rewards/final_reward": 0.7825476837291065, "rewards/mask_iou_reward": 0.3912738418645533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2714394330978394, "rewards/thk_ans_format_reward": 1.0, "step": 1756, "think_completion_length": 49.1875 }, { "clip_ratio": 0.0, "completion_length": 175.609375, "epoch": 2.9662731871838113, "grad_norm": 25.324780980102272, "kl": 0.470703125, "learning_rate": 4.0741989881956153e-07, "loss": 0.0005, "reward": 3.5654423236846924, "reward_std": 0.06877763196825981, "rewards/final_reward": 1.6263870640270404, "rewards/mask_iou_reward": 0.8131935320135202, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5654422640800476, "rewards/thk_ans_format_reward": 1.0, "step": 1757, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 120.75, "epoch": 2.9679595278246205, "grad_norm": 4.14111248427749, "kl": 0.537109375, "learning_rate": 4.070826306913996e-07, "loss": 0.0005, "reward": 3.3682453632354736, "reward_std": 0.16081257164478302, "rewards/final_reward": 1.6700165070119228, "rewards/mask_iou_reward": 0.8350082535059614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3682452738285065, "rewards/thk_ans_format_reward": 1.0, "step": 1758, "think_completion_length": 45.53125 }, { "clip_ratio": 0.0, "completion_length": 142.109375, "epoch": 2.96964586846543, "grad_norm": 6.129164184058998, "kl": 0.5380859375, "learning_rate": 4.0674536256323776e-07, "loss": 0.0005, "reward": 3.21283495426178, "reward_std": 0.12503460049629211, "rewards/final_reward": 1.8603373570177637, "rewards/mask_iou_reward": 0.9301686785088819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2128351330757141, "rewards/thk_ans_format_reward": 1.0, "step": 1759, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 116.9375, "epoch": 2.9713322091062393, "grad_norm": 13.32716958264156, "kl": 0.56640625, "learning_rate": 4.0640809443507585e-07, "loss": 0.0006, "reward": 3.5802561044692993, "reward_std": 0.13087457790970802, "rewards/final_reward": 1.5221519900021883, "rewards/mask_iou_reward": 0.7610759950010941, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5802561044692993, "rewards/thk_ans_format_reward": 1.0, "step": 1760, "think_completion_length": 47.96875 }, { "clip_ratio": 0.0, "completion_length": 118.984375, "epoch": 2.973018549747049, "grad_norm": 5.719532211581828, "kl": 0.529296875, "learning_rate": 4.06070826306914e-07, "loss": 0.0005, "reward": 3.140426754951477, "reward_std": 0.013218533713370562, "rewards/final_reward": 1.105653311013493, "rewards/mask_iou_reward": 0.5528266555067465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.140426754951477, "rewards/thk_ans_format_reward": 1.0, "step": 1761, "think_completion_length": 48.96875 }, { "clip_ratio": 0.0, "completion_length": 121.78125, "epoch": 2.9747048903878586, "grad_norm": 8.80106383889409, "kl": 0.611328125, "learning_rate": 4.057335581787521e-07, "loss": 0.0006, "reward": 3.4417465925216675, "reward_std": 0.09109633043408394, "rewards/final_reward": 1.3834932937002937, "rewards/mask_iou_reward": 0.6917466468501469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.441746711730957, "rewards/thk_ans_format_reward": 1.0, "step": 1762, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 130.96875, "epoch": 2.9763912310286678, "grad_norm": 5.198333548111168, "kl": 0.603515625, "learning_rate": 4.053962900505902e-07, "loss": 0.0006, "reward": 3.007934093475342, "reward_std": 0.30538563430309296, "rewards/final_reward": 0.8716766112194474, "rewards/mask_iou_reward": 0.4358383056097237, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0079339742660522, "rewards/thk_ans_format_reward": 1.0, "step": 1763, "think_completion_length": 43.8125 }, { "clip_ratio": 0.0, "completion_length": 153.46875, "epoch": 2.9780775716694774, "grad_norm": 8.420377628718501, "kl": 0.52734375, "learning_rate": 4.050590219224283e-07, "loss": 0.0005, "reward": 2.945202946662903, "reward_std": 0.10134740360081196, "rewards/final_reward": 1.0092593098713687, "rewards/mask_iou_reward": 0.5046296549356843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9452029466629028, "rewards/thk_ans_format_reward": 1.0, "step": 1764, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 152.453125, "epoch": 2.9797639123102866, "grad_norm": 6.42537877319895, "kl": 0.580078125, "learning_rate": 4.0472175379426644e-07, "loss": 0.0006, "reward": 2.7219390869140625, "reward_std": 0.10427241958677769, "rewards/final_reward": 0.5188378030255072, "rewards/mask_iou_reward": 0.2594189015127536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.721939206123352, "rewards/thk_ans_format_reward": 1.0, "step": 1765, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 118.78125, "epoch": 2.9814502529510962, "grad_norm": 9.522081916215093, "kl": 0.5546875, "learning_rate": 4.043844856661046e-07, "loss": 0.0006, "reward": 3.8684887886047363, "reward_std": 0.01769642811268568, "rewards/final_reward": 1.8679418511744035, "rewards/mask_iou_reward": 0.9339709255872017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8684889078140259, "rewards/thk_ans_format_reward": 1.0, "step": 1766, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 127.84375, "epoch": 2.9831365935919054, "grad_norm": 13.612631308119484, "kl": 0.630859375, "learning_rate": 4.040472175379426e-07, "loss": 0.0006, "reward": 3.1474589109420776, "reward_std": 0.029912306927144527, "rewards/final_reward": 1.2405054257944175, "rewards/mask_iou_reward": 0.6202527128972087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1474591195583344, "rewards/thk_ans_format_reward": 1.0, "step": 1767, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 124.578125, "epoch": 2.984822934232715, "grad_norm": 19.224794027956456, "kl": 0.5224609375, "learning_rate": 4.0370994940978076e-07, "loss": 0.0005, "reward": 3.060731887817383, "reward_std": 0.10396159812808037, "rewards/final_reward": 1.2167539201734914, "rewards/mask_iou_reward": 0.6083769600867457, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.060731828212738, "rewards/thk_ans_format_reward": 1.0, "step": 1768, "think_completion_length": 37.34375 }, { "clip_ratio": 0.0, "completion_length": 176.796875, "epoch": 2.9865092748735247, "grad_norm": 7.278733591497656, "kl": 0.4716796875, "learning_rate": 4.033726812816189e-07, "loss": 0.0004, "reward": 3.6651105880737305, "reward_std": 0.30302999913692474, "rewards/final_reward": 1.6531495825342206, "rewards/mask_iou_reward": 0.8265747912671103, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.6807357668876648, "rewards/thk_ans_format_reward": 1.0, "step": 1769, "think_completion_length": 49.5 }, { "clip_ratio": 0.0, "completion_length": 113.0625, "epoch": 2.988195615514334, "grad_norm": 15.610608125179827, "kl": 0.609375, "learning_rate": 4.03035413153457e-07, "loss": 0.0006, "reward": 3.306709885597229, "reward_std": 0.160341314971447, "rewards/final_reward": 1.3282952467844442, "rewards/mask_iou_reward": 0.6641476233922221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.306709885597229, "rewards/thk_ans_format_reward": 1.0, "step": 1770, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 129.8125, "epoch": 2.989881956155143, "grad_norm": 19.309088968802552, "kl": 0.576171875, "learning_rate": 4.0269814502529507e-07, "loss": 0.0006, "reward": 3.4969851970672607, "reward_std": 0.049538787454366684, "rewards/final_reward": 1.8811737143384755, "rewards/mask_iou_reward": 0.9405868571692377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4969850778579712, "rewards/thk_ans_format_reward": 1.0, "step": 1771, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 204.921875, "epoch": 2.9915682967959527, "grad_norm": 8.792433933486913, "kl": 0.4873046875, "learning_rate": 4.023608768971332e-07, "loss": 0.0005, "reward": 3.364785671234131, "reward_std": 0.1412600614130497, "rewards/final_reward": 1.4665705126725315, "rewards/mask_iou_reward": 0.7332852563362657, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.364785611629486, "rewards/thk_ans_format_reward": 1.0, "step": 1772, "think_completion_length": 38.71875 }, { "clip_ratio": 0.0, "completion_length": 131.625, "epoch": 2.9932546374367623, "grad_norm": 6.145988479809462, "kl": 0.58203125, "learning_rate": 4.020236087689713e-07, "loss": 0.0006, "reward": 3.1206058263778687, "reward_std": 0.20862603932619095, "rewards/final_reward": 1.0969574547093313, "rewards/mask_iou_reward": 0.5484787273546656, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1206059157848358, "rewards/thk_ans_format_reward": 1.0, "step": 1773, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 112.1875, "epoch": 2.9949409780775715, "grad_norm": 9.213062201212532, "kl": 0.5859375, "learning_rate": 4.0168634064080944e-07, "loss": 0.0006, "reward": 3.3638638257980347, "reward_std": 0.18089959397912025, "rewards/final_reward": 1.077929304204421, "rewards/mask_iou_reward": 0.5389646521022105, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3638638854026794, "rewards/thk_ans_format_reward": 1.0, "step": 1774, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 162.046875, "epoch": 2.996627318718381, "grad_norm": 11.1703013355821, "kl": 0.59375, "learning_rate": 4.013490725126475e-07, "loss": 0.0006, "reward": 2.8246114253997803, "reward_std": 0.32476918399333954, "rewards/final_reward": 1.1536238002372674, "rewards/mask_iou_reward": 0.5768119001186337, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.8558615148067474, "rewards/thk_ans_format_reward": 0.984375, "step": 1775, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 106.66666793823242, "epoch": 2.998313659359191, "grad_norm": 8.430944669515075, "kl": 0.634765625, "learning_rate": 4.0101180438448567e-07, "loss": 0.0007, "reward": 3.570656657218933, "reward_std": 0.26290661143139005, "rewards/final_reward": 1.7582903216175922, "rewards/mask_iou_reward": 0.8791451608087961, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5706565380096436, "rewards/thk_ans_format_reward": 1.0, "step": 1776, "think_completion_length": 39.84375 }, { "clip_ratio": 0.0, "completion_length": 112.125, "epoch": 3.0016863406408096, "grad_norm": 7.531012330436627, "kl": 0.63671875, "learning_rate": 4.0067453625632375e-07, "loss": 0.0006, "reward": 3.4569878578186035, "reward_std": 0.05659590847790241, "rewards/final_reward": 1.430502925295769, "rewards/mask_iou_reward": 0.7152514626478845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.456987977027893, "rewards/thk_ans_format_reward": 1.0, "step": 1777, "think_completion_length": 43.9375 }, { "clip_ratio": 0.0, "completion_length": 110.265625, "epoch": 3.003372681281619, "grad_norm": 12.588023913728449, "kl": 0.904296875, "learning_rate": 4.003372681281619e-07, "loss": 0.0009, "reward": 2.8988207578659058, "reward_std": 0.06972062401473522, "rewards/final_reward": 0.8970200385350756, "rewards/mask_iou_reward": 0.4485100192675378, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8988207578659058, "rewards/thk_ans_format_reward": 1.0, "step": 1778, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 111.28125, "epoch": 3.0050590219224285, "grad_norm": 7.991088506498573, "kl": 0.6171875, "learning_rate": 4e-07, "loss": 0.0006, "reward": 3.4317870140075684, "reward_std": 0.06494680885225534, "rewards/final_reward": 1.5347575528414017, "rewards/mask_iou_reward": 0.7673787764207008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4317869544029236, "rewards/thk_ans_format_reward": 1.0, "step": 1779, "think_completion_length": 42.96875 }, { "clip_ratio": 0.0, "completion_length": 114.6875, "epoch": 3.0067453625632377, "grad_norm": 6.296840375046232, "kl": 0.58984375, "learning_rate": 3.9966273187183807e-07, "loss": 0.0006, "reward": 3.519545555114746, "reward_std": 0.0419915160164237, "rewards/final_reward": 1.8707376427400908, "rewards/mask_iou_reward": 0.9353688213700454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.519545555114746, "rewards/thk_ans_format_reward": 1.0, "step": 1780, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 109.015625, "epoch": 3.0084317032040473, "grad_norm": 10.765617905439093, "kl": 0.62890625, "learning_rate": 3.993254637436762e-07, "loss": 0.0006, "reward": 2.860144257545471, "reward_std": 0.21056190133094788, "rewards/final_reward": 0.46607457041584754, "rewards/mask_iou_reward": 0.23303728520792377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8601441979408264, "rewards/thk_ans_format_reward": 1.0, "step": 1781, "think_completion_length": 42.78125 }, { "clip_ratio": 0.0, "completion_length": 116.21875, "epoch": 3.0101180438448565, "grad_norm": 12.267309726746078, "kl": 0.59375, "learning_rate": 3.9898819561551435e-07, "loss": 0.0006, "reward": 3.506497859954834, "reward_std": 0.07470344379544258, "rewards/final_reward": 1.2161605416955577, "rewards/mask_iou_reward": 0.6080802708477788, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.506497859954834, "rewards/thk_ans_format_reward": 1.0, "step": 1782, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 114.109375, "epoch": 3.011804384485666, "grad_norm": 5.569775037193244, "kl": 0.609375, "learning_rate": 3.986509274873524e-07, "loss": 0.0006, "reward": 3.2471498250961304, "reward_std": 0.023415432777255774, "rewards/final_reward": 1.3677550518400219, "rewards/mask_iou_reward": 0.6838775259200109, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2471499741077423, "rewards/thk_ans_format_reward": 1.0, "step": 1783, "think_completion_length": 42.375 }, { "clip_ratio": 0.0, "completion_length": 115.1875, "epoch": 3.0134907251264758, "grad_norm": 13.149543840594943, "kl": 0.6640625, "learning_rate": 3.983136593591905e-07, "loss": 0.0007, "reward": 3.5586479902267456, "reward_std": 0.0933561883866787, "rewards/final_reward": 1.5804140872277448, "rewards/mask_iou_reward": 0.7902070436138724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5586479306221008, "rewards/thk_ans_format_reward": 1.0, "step": 1784, "think_completion_length": 40.15625 }, { "clip_ratio": 0.0, "completion_length": 133.265625, "epoch": 3.015177065767285, "grad_norm": 6.182000400527594, "kl": 0.7109375, "learning_rate": 3.9797639123102867e-07, "loss": 0.0007, "reward": 3.558907985687256, "reward_std": 0.04706683196127415, "rewards/final_reward": 1.738189228029709, "rewards/mask_iou_reward": 0.8690946140148545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5589080452919006, "rewards/thk_ans_format_reward": 1.0, "step": 1785, "think_completion_length": 38.09375 }, { "clip_ratio": 0.0, "completion_length": 114.515625, "epoch": 3.0168634064080946, "grad_norm": 18.46072762580063, "kl": 0.5625, "learning_rate": 3.9763912310286675e-07, "loss": 0.0006, "reward": 3.6227529048919678, "reward_std": 0.2536686926614493, "rewards/final_reward": 1.8278806704202584, "rewards/mask_iou_reward": 0.9139403352101292, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6227527856826782, "rewards/thk_ans_format_reward": 1.0, "step": 1786, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 113.515625, "epoch": 3.0185497470489038, "grad_norm": 5.981655691507406, "kl": 0.6484375, "learning_rate": 3.9730185497470484e-07, "loss": 0.0006, "reward": 3.110731363296509, "reward_std": 0.10306009650230408, "rewards/final_reward": 1.2463825041064651, "rewards/mask_iou_reward": 0.6231912520532326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1107314229011536, "rewards/thk_ans_format_reward": 1.0, "step": 1787, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 182.578125, "epoch": 3.0202360876897134, "grad_norm": 19.93758781645736, "kl": 0.70703125, "learning_rate": 3.96964586846543e-07, "loss": 0.0007, "reward": 2.9761908054351807, "reward_std": 0.2858365625143051, "rewards/final_reward": 1.15384431231489, "rewards/mask_iou_reward": 0.576922156157445, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.007440596818924, "rewards/thk_ans_format_reward": 0.984375, "step": 1788, "think_completion_length": 44.875 }, { "clip_ratio": 0.0, "completion_length": 109.84375, "epoch": 3.0219224283305226, "grad_norm": 8.834083836552558, "kl": 0.576171875, "learning_rate": 3.966273187183811e-07, "loss": 0.0006, "reward": 3.8274420499801636, "reward_std": 0.04052088037133217, "rewards/final_reward": 1.8624797041000785, "rewards/mask_iou_reward": 0.9312398520500392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8274420499801636, "rewards/thk_ans_format_reward": 1.0, "step": 1789, "think_completion_length": 40.0 }, { "clip_ratio": 0.0, "completion_length": 112.71875, "epoch": 3.0236087689713322, "grad_norm": 7.803750825223315, "kl": 0.59765625, "learning_rate": 3.962900505902192e-07, "loss": 0.0005, "reward": 2.902255654335022, "reward_std": 0.09178433939814568, "rewards/final_reward": 1.4428867109175392, "rewards/mask_iou_reward": 0.7214433554587696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9022556245326996, "rewards/thk_ans_format_reward": 1.0, "step": 1790, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 110.234375, "epoch": 3.0252951096121414, "grad_norm": 7.234996353582277, "kl": 0.953125, "learning_rate": 3.9595278246205735e-07, "loss": 0.0009, "reward": 3.684625029563904, "reward_std": 0.034050445072352886, "rewards/final_reward": 1.7472793780932192, "rewards/mask_iou_reward": 0.8736396890466096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6846250891685486, "rewards/thk_ans_format_reward": 1.0, "step": 1791, "think_completion_length": 39.53125 }, { "clip_ratio": 0.0, "completion_length": 108.484375, "epoch": 3.026981450252951, "grad_norm": 7.483545957839358, "kl": 0.609375, "learning_rate": 3.9561551433389544e-07, "loss": 0.0006, "reward": 3.345807671546936, "reward_std": 0.21587074548006058, "rewards/final_reward": 1.7519694489910984, "rewards/mask_iou_reward": 0.8759847244955492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.345807671546936, "rewards/thk_ans_format_reward": 1.0, "step": 1792, "think_completion_length": 39.34375 }, { "clip_ratio": 0.0, "completion_length": 112.546875, "epoch": 3.0286677908937607, "grad_norm": 9.931220248630632, "kl": 0.609375, "learning_rate": 3.952782462057335e-07, "loss": 0.0006, "reward": 3.640244960784912, "reward_std": 0.11672806553542614, "rewards/final_reward": 1.5120892039081364, "rewards/mask_iou_reward": 0.7560446019540682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6402450799942017, "rewards/thk_ans_format_reward": 1.0, "step": 1793, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 125.640625, "epoch": 3.03035413153457, "grad_norm": 26.60166896179453, "kl": 0.54296875, "learning_rate": 3.9494097807757166e-07, "loss": 0.0006, "reward": 3.89884877204895, "reward_std": 0.1477795336395502, "rewards/final_reward": 1.89302786543562, "rewards/mask_iou_reward": 0.94651393271781, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.898848831653595, "rewards/thk_ans_format_reward": 1.0, "step": 1794, "think_completion_length": 43.03125 }, { "clip_ratio": 0.0, "completion_length": 115.0, "epoch": 3.0320404721753795, "grad_norm": 14.90534750812744, "kl": 0.6015625, "learning_rate": 3.946037099494098e-07, "loss": 0.0006, "reward": 3.7795809507369995, "reward_std": 0.018881912576034665, "rewards/final_reward": 1.8978212875884442, "rewards/mask_iou_reward": 0.9489106437942221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.77958083152771, "rewards/thk_ans_format_reward": 1.0, "step": 1795, "think_completion_length": 43.5 }, { "clip_ratio": 0.0, "completion_length": 139.4375, "epoch": 3.0337268128161887, "grad_norm": 7.302222131132335, "kl": 0.537109375, "learning_rate": 3.9426644182124784e-07, "loss": 0.0005, "reward": 3.4302122592926025, "reward_std": 0.11048243194818497, "rewards/final_reward": 1.775548221195335, "rewards/mask_iou_reward": 0.8877741105976675, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.430212378501892, "rewards/thk_ans_format_reward": 1.0, "step": 1796, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 113.953125, "epoch": 3.0354131534569984, "grad_norm": 8.81006701656291, "kl": 0.595703125, "learning_rate": 3.93929173693086e-07, "loss": 0.0006, "reward": 3.185991048812866, "reward_std": 0.052945384522899985, "rewards/final_reward": 0.7679530304436115, "rewards/mask_iou_reward": 0.38397651522180576, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.185991108417511, "rewards/thk_ans_format_reward": 1.0, "step": 1797, "think_completion_length": 38.625 }, { "clip_ratio": 0.0, "completion_length": 108.640625, "epoch": 3.0370994940978076, "grad_norm": 5.821901064823728, "kl": 0.640625, "learning_rate": 3.935919055649241e-07, "loss": 0.0007, "reward": 3.6197181940078735, "reward_std": 0.035361507907509804, "rewards/final_reward": 1.424851699363638, "rewards/mask_iou_reward": 0.712425849681819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.619718074798584, "rewards/thk_ans_format_reward": 1.0, "step": 1798, "think_completion_length": 37.53125 }, { "clip_ratio": 0.0, "completion_length": 148.546875, "epoch": 3.038785834738617, "grad_norm": 10.127292142360464, "kl": 0.548828125, "learning_rate": 3.932546374367622e-07, "loss": 0.0005, "reward": 3.2065749168395996, "reward_std": 0.18304883688688278, "rewards/final_reward": 1.2521124267152612, "rewards/mask_iou_reward": 0.6260562133576306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2065749168395996, "rewards/thk_ans_format_reward": 1.0, "step": 1799, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 126.40625, "epoch": 3.040472175379427, "grad_norm": 8.878520520027035, "kl": 0.564453125, "learning_rate": 3.929173693086003e-07, "loss": 0.0006, "reward": 3.5881153345108032, "reward_std": 0.03201424656435847, "rewards/final_reward": 1.4877122646509164, "rewards/mask_iou_reward": 0.7438561323254582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5881155133247375, "rewards/thk_ans_format_reward": 1.0, "step": 1800, "think_completion_length": 40.5 }, { "clip_ratio": 0.0, "completion_length": 107.90625, "epoch": 3.042158516020236, "grad_norm": 7.864526301429089, "kl": 0.587890625, "learning_rate": 3.9258010118043843e-07, "loss": 0.0005, "reward": 3.623053550720215, "reward_std": 0.2628798196092248, "rewards/final_reward": 1.631788901154074, "rewards/mask_iou_reward": 0.815894450577037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6230534315109253, "rewards/thk_ans_format_reward": 1.0, "step": 1801, "think_completion_length": 45.5625 }, { "clip_ratio": 0.0, "completion_length": 116.296875, "epoch": 3.0438448566610457, "grad_norm": 7.755192761497758, "kl": 0.595703125, "learning_rate": 3.922428330522766e-07, "loss": 0.0006, "reward": 3.6610530614852905, "reward_std": 0.06777806580066681, "rewards/final_reward": 1.803045456398975, "rewards/mask_iou_reward": 0.9015227281994875, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6610528826713562, "rewards/thk_ans_format_reward": 1.0, "step": 1802, "think_completion_length": 45.84375 }, { "clip_ratio": 0.0, "completion_length": 112.53125, "epoch": 3.045531197301855, "grad_norm": 11.508893233818217, "kl": 0.708984375, "learning_rate": 3.9190556492411466e-07, "loss": 0.0007, "reward": 2.952297568321228, "reward_std": 0.0924637708812952, "rewards/final_reward": 0.9303527118823702, "rewards/mask_iou_reward": 0.4651763559411851, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.952297568321228, "rewards/thk_ans_format_reward": 1.0, "step": 1803, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 144.578125, "epoch": 3.0472175379426645, "grad_norm": 8.761930391197803, "kl": 0.625, "learning_rate": 3.9156829679595275e-07, "loss": 0.0006, "reward": 3.518397331237793, "reward_std": 0.10372760146856308, "rewards/final_reward": 1.7520384498088726, "rewards/mask_iou_reward": 0.8760192249044363, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5183972716331482, "rewards/thk_ans_format_reward": 1.0, "step": 1804, "think_completion_length": 39.0625 }, { "clip_ratio": 0.0, "completion_length": 116.921875, "epoch": 3.0489038785834737, "grad_norm": 6.091115422282127, "kl": 0.62109375, "learning_rate": 3.912310286677909e-07, "loss": 0.0006, "reward": 3.6024467945098877, "reward_std": 0.038248912431299686, "rewards/final_reward": 1.5841322268045297, "rewards/mask_iou_reward": 0.7920661134022648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6024468541145325, "rewards/thk_ans_format_reward": 1.0, "step": 1805, "think_completion_length": 43.71875 }, { "clip_ratio": 0.0, "completion_length": 132.765625, "epoch": 3.0505902192242833, "grad_norm": 10.536867458322101, "kl": 0.583984375, "learning_rate": 3.90893760539629e-07, "loss": 0.0006, "reward": 3.5221776962280273, "reward_std": 0.19090192764997482, "rewards/final_reward": 1.1545756872413946, "rewards/mask_iou_reward": 0.5772878436206973, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5221776962280273, "rewards/thk_ans_format_reward": 1.0, "step": 1806, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 158.78125, "epoch": 3.052276559865093, "grad_norm": 5.970328374731551, "kl": 0.517578125, "learning_rate": 3.905564924114671e-07, "loss": 0.0005, "reward": 2.939510464668274, "reward_std": 0.046596916392445564, "rewards/final_reward": 1.0414819878594614, "rewards/mask_iou_reward": 0.5207409939297307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9395104348659515, "rewards/thk_ans_format_reward": 1.0, "step": 1807, "think_completion_length": 40.0 }, { "clip_ratio": 0.0, "completion_length": 112.046875, "epoch": 3.053962900505902, "grad_norm": 8.809238899912609, "kl": 0.60546875, "learning_rate": 3.902192242833052e-07, "loss": 0.0006, "reward": 3.550176501274109, "reward_std": 0.18766392022371292, "rewards/final_reward": 1.3816421309674876, "rewards/mask_iou_reward": 0.6908210654837438, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5501765012741089, "rewards/thk_ans_format_reward": 1.0, "step": 1808, "think_completion_length": 37.0625 }, { "clip_ratio": 0.0, "completion_length": 116.734375, "epoch": 3.0556492411467118, "grad_norm": 6.52671823087687, "kl": 0.6796875, "learning_rate": 3.898819561551433e-07, "loss": 0.0007, "reward": 3.23820424079895, "reward_std": 0.14676055498421192, "rewards/final_reward": 1.2919858793710988, "rewards/mask_iou_reward": 0.6459929396855494, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2382042407989502, "rewards/thk_ans_format_reward": 1.0, "step": 1809, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 166.15625, "epoch": 3.057335581787521, "grad_norm": 11.07613980798843, "kl": 0.55078125, "learning_rate": 3.8954468802698143e-07, "loss": 0.0006, "reward": 3.0210577249526978, "reward_std": 0.08838908141478896, "rewards/final_reward": 1.0451431337686916, "rewards/mask_iou_reward": 0.5225715668843458, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0210577249526978, "rewards/thk_ans_format_reward": 1.0, "step": 1810, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 115.578125, "epoch": 3.0590219224283306, "grad_norm": 16.112354584949998, "kl": 0.5703125, "learning_rate": 3.8920741989881957e-07, "loss": 0.0006, "reward": 3.423478841781616, "reward_std": 0.012333399849012494, "rewards/final_reward": 0.942286153388663, "rewards/mask_iou_reward": 0.4711430766943315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4234787225723267, "rewards/thk_ans_format_reward": 1.0, "step": 1811, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 116.625, "epoch": 3.06070826306914, "grad_norm": 10.626531813912603, "kl": 0.638671875, "learning_rate": 3.888701517706576e-07, "loss": 0.0006, "reward": 3.6403441429138184, "reward_std": 0.022046887315809727, "rewards/final_reward": 1.6076258115450064, "rewards/mask_iou_reward": 0.8038129057725032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6403440237045288, "rewards/thk_ans_format_reward": 1.0, "step": 1812, "think_completion_length": 41.90625 }, { "clip_ratio": 0.0, "completion_length": 112.171875, "epoch": 3.0623946037099494, "grad_norm": 10.574157055501939, "kl": 0.626953125, "learning_rate": 3.8853288364249575e-07, "loss": 0.0006, "reward": 3.482790231704712, "reward_std": 0.10968651808798313, "rewards/final_reward": 1.584461325845456, "rewards/mask_iou_reward": 0.792230662922728, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4827901124954224, "rewards/thk_ans_format_reward": 1.0, "step": 1813, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 113.625, "epoch": 3.064080944350759, "grad_norm": 8.79314057815561, "kl": 0.580078125, "learning_rate": 3.881956155143339e-07, "loss": 0.0006, "reward": 3.6398913860321045, "reward_std": 0.2976074144244194, "rewards/final_reward": 1.501840785814263, "rewards/mask_iou_reward": 0.7509203929071315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6398914456367493, "rewards/thk_ans_format_reward": 1.0, "step": 1814, "think_completion_length": 44.8125 }, { "clip_ratio": 0.0, "completion_length": 173.515625, "epoch": 3.0657672849915683, "grad_norm": 6.234654076026071, "kl": 0.529296875, "learning_rate": 3.8785834738617203e-07, "loss": 0.0005, "reward": 3.3249377012252808, "reward_std": 0.2078157588839531, "rewards/final_reward": 1.3281841385162092, "rewards/mask_iou_reward": 0.6640920692581046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3249376714229584, "rewards/thk_ans_format_reward": 1.0, "step": 1815, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 112.0625, "epoch": 3.067453625632378, "grad_norm": 10.396544747347201, "kl": 0.564453125, "learning_rate": 3.8752107925801006e-07, "loss": 0.0006, "reward": 3.4935293197631836, "reward_std": 0.04673771560192108, "rewards/final_reward": 1.6538122748826627, "rewards/mask_iou_reward": 0.8269061374413313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4935293197631836, "rewards/thk_ans_format_reward": 1.0, "step": 1816, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 136.46875, "epoch": 3.069139966273187, "grad_norm": 21.842182739016284, "kl": 0.544921875, "learning_rate": 3.871838111298482e-07, "loss": 0.0005, "reward": 3.6569454669952393, "reward_std": 0.14664340764284134, "rewards/final_reward": 1.8561856677784982, "rewards/mask_iou_reward": 0.9280928338892491, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6569453477859497, "rewards/thk_ans_format_reward": 1.0, "step": 1817, "think_completion_length": 39.25 }, { "clip_ratio": 0.0, "completion_length": 187.5625, "epoch": 3.0708263069139967, "grad_norm": 14.884495823563416, "kl": 0.45703125, "learning_rate": 3.8684654300168634e-07, "loss": 0.0005, "reward": 3.5717333555221558, "reward_std": 0.16016625985503197, "rewards/final_reward": 1.4241937334443704, "rewards/mask_iou_reward": 0.7120968667221852, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.571733295917511, "rewards/thk_ans_format_reward": 1.0, "step": 1818, "think_completion_length": 43.4375 }, { "clip_ratio": 0.0, "completion_length": 172.40625, "epoch": 3.072512647554806, "grad_norm": 100.86990875217444, "kl": 0.65234375, "learning_rate": 3.8650927487352443e-07, "loss": 0.0007, "reward": 3.7288215160369873, "reward_std": 0.055927949026227, "rewards/final_reward": 1.616559484813261, "rewards/mask_iou_reward": 0.8082797424066305, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.728821575641632, "rewards/thk_ans_format_reward": 1.0, "step": 1819, "think_completion_length": 43.40625 }, { "clip_ratio": 0.0, "completion_length": 119.125, "epoch": 3.0741989881956155, "grad_norm": 34.84247946910707, "kl": 0.59375, "learning_rate": 3.861720067453625e-07, "loss": 0.0006, "reward": 3.3209747076034546, "reward_std": 0.15957476571202278, "rewards/final_reward": 1.6474255953448638, "rewards/mask_iou_reward": 0.8237127976724319, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3209747672080994, "rewards/thk_ans_format_reward": 1.0, "step": 1820, "think_completion_length": 47.0625 }, { "clip_ratio": 0.0, "completion_length": 130.890625, "epoch": 3.075885328836425, "grad_norm": 9.821093622718976, "kl": 0.541015625, "learning_rate": 3.8583473861720066e-07, "loss": 0.0005, "reward": 3.1136986017227173, "reward_std": 0.3179262578487396, "rewards/final_reward": 1.0161626959025258, "rewards/mask_iou_reward": 0.5080813479512629, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1136985421180725, "rewards/thk_ans_format_reward": 1.0, "step": 1821, "think_completion_length": 50.5625 }, { "clip_ratio": 0.0, "completion_length": 161.71875, "epoch": 3.0775716694772344, "grad_norm": 8.86875330195681, "kl": 0.609375, "learning_rate": 3.8549747048903875e-07, "loss": 0.0006, "reward": 3.530484437942505, "reward_std": 0.10517753660678864, "rewards/final_reward": 1.6550108728196373, "rewards/mask_iou_reward": 0.8275054364098187, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5304844379425049, "rewards/thk_ans_format_reward": 1.0, "step": 1822, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 173.640625, "epoch": 3.079258010118044, "grad_norm": 10.663893597077623, "kl": 0.541015625, "learning_rate": 3.851602023608769e-07, "loss": 0.0005, "reward": 3.1298916339874268, "reward_std": 0.1926565244793892, "rewards/final_reward": 1.1494982733137804, "rewards/mask_iou_reward": 0.5747491366568902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1298914551734924, "rewards/thk_ans_format_reward": 1.0, "step": 1823, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 127.53125, "epoch": 3.080944350758853, "grad_norm": 5.634180631015267, "kl": 0.54296875, "learning_rate": 3.84822934232715e-07, "loss": 0.0005, "reward": 3.306205630302429, "reward_std": 0.134456398896873, "rewards/final_reward": 1.1425007583452398, "rewards/mask_iou_reward": 0.5712503791726199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3062056303024292, "rewards/thk_ans_format_reward": 1.0, "step": 1824, "think_completion_length": 46.125 }, { "clip_ratio": 0.0, "completion_length": 125.875, "epoch": 3.082630691399663, "grad_norm": 9.726825128136838, "kl": 1.09375, "learning_rate": 3.8448566610455306e-07, "loss": 0.001, "reward": 3.8346996307373047, "reward_std": 0.02099014213308692, "rewards/final_reward": 1.9401270741644736, "rewards/mask_iou_reward": 0.9700635370822368, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.834699809551239, "rewards/thk_ans_format_reward": 1.0, "step": 1825, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 126.296875, "epoch": 3.084317032040472, "grad_norm": 8.46581095839141, "kl": 0.578125, "learning_rate": 3.841483979763912e-07, "loss": 0.0006, "reward": 3.462043046951294, "reward_std": 0.1276659220457077, "rewards/final_reward": 1.5772126033320073, "rewards/mask_iou_reward": 0.7886063016660037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4620429277420044, "rewards/thk_ans_format_reward": 1.0, "step": 1826, "think_completion_length": 44.15625 }, { "clip_ratio": 0.0, "completion_length": 183.640625, "epoch": 3.0860033726812817, "grad_norm": 6.099141209808028, "kl": 0.4775390625, "learning_rate": 3.8381112984822934e-07, "loss": 0.0005, "reward": 3.5183308124542236, "reward_std": 0.0571708045899868, "rewards/final_reward": 1.3577895920311414, "rewards/mask_iou_reward": 0.6788947960155707, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.518330991268158, "rewards/thk_ans_format_reward": 1.0, "step": 1827, "think_completion_length": 44.71875 }, { "clip_ratio": 0.0, "completion_length": 250.375, "epoch": 3.087689713322091, "grad_norm": 6.6582942751110705, "kl": 0.5400390625, "learning_rate": 3.834738617200675e-07, "loss": 0.0005, "reward": 3.124674081802368, "reward_std": 0.617139033973217, "rewards/final_reward": 1.416433364585445, "rewards/mask_iou_reward": 0.7082166822927225, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.249674141407013, "rewards/thk_ans_format_reward": 0.9375, "step": 1828, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 112.125, "epoch": 3.0893760539629005, "grad_norm": 10.00986038855336, "kl": 0.623046875, "learning_rate": 3.831365935919055e-07, "loss": 0.0006, "reward": 3.3493294715881348, "reward_std": 0.11154869198799133, "rewards/final_reward": 1.6091478659368443, "rewards/mask_iou_reward": 0.8045739329684222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3493293523788452, "rewards/thk_ans_format_reward": 1.0, "step": 1829, "think_completion_length": 39.9375 }, { "clip_ratio": 0.0, "completion_length": 99.734375, "epoch": 3.09106239460371, "grad_norm": 8.046581936185516, "kl": 0.658203125, "learning_rate": 3.8279932546374366e-07, "loss": 0.0007, "reward": 3.2317744493484497, "reward_std": 0.11304565519094467, "rewards/final_reward": 1.505587991593928, "rewards/mask_iou_reward": 0.752793995796964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2317743301391602, "rewards/thk_ans_format_reward": 1.0, "step": 1830, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 116.1875, "epoch": 3.0927487352445193, "grad_norm": 26.068413271225808, "kl": 0.5859375, "learning_rate": 3.824620573355818e-07, "loss": 0.0006, "reward": 3.3999075889587402, "reward_std": 0.0778821213170886, "rewards/final_reward": 1.7049617555467194, "rewards/mask_iou_reward": 0.8524808777733597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.399907648563385, "rewards/thk_ans_format_reward": 1.0, "step": 1831, "think_completion_length": 44.65625 }, { "clip_ratio": 0.0, "completion_length": 104.140625, "epoch": 3.094435075885329, "grad_norm": 5.737550620244319, "kl": 0.578125, "learning_rate": 3.821247892074199e-07, "loss": 0.0006, "reward": 3.555485725402832, "reward_std": 0.19017744529992342, "rewards/final_reward": 1.50405068886782, "rewards/mask_iou_reward": 0.75202534443391, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.555485486984253, "rewards/thk_ans_format_reward": 1.0, "step": 1832, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 172.796875, "epoch": 3.096121416526138, "grad_norm": 9.613502793709598, "kl": 0.5234375, "learning_rate": 3.8178752107925797e-07, "loss": 0.0005, "reward": 2.7982553243637085, "reward_std": 0.2548002079129219, "rewards/final_reward": 0.7696277788214507, "rewards/mask_iou_reward": 0.38481388941072536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7982552647590637, "rewards/thk_ans_format_reward": 1.0, "step": 1833, "think_completion_length": 46.125 }, { "clip_ratio": 0.0, "completion_length": 117.84375, "epoch": 3.097807757166948, "grad_norm": 8.490748952629394, "kl": 0.61328125, "learning_rate": 3.814502529510961e-07, "loss": 0.0006, "reward": 3.291377305984497, "reward_std": 0.3434144649654627, "rewards/final_reward": 1.1587527805696234, "rewards/mask_iou_reward": 0.5793763902848117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2913771867752075, "rewards/thk_ans_format_reward": 1.0, "step": 1834, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 150.90625, "epoch": 3.099494097807757, "grad_norm": 4.401463888525623, "kl": 0.525390625, "learning_rate": 3.811129848229342e-07, "loss": 0.0005, "reward": 3.6651761531829834, "reward_std": 0.18362296093255281, "rewards/final_reward": 1.6550658545344978, "rewards/mask_iou_reward": 0.8275329272672489, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.665175974369049, "rewards/thk_ans_format_reward": 1.0, "step": 1835, "think_completion_length": 46.1875 }, { "clip_ratio": 0.0, "completion_length": 115.15625, "epoch": 3.1011804384485666, "grad_norm": 4.078236651295097, "kl": 0.5859375, "learning_rate": 3.8077571669477234e-07, "loss": 0.0006, "reward": 3.1922521591186523, "reward_std": 0.005031302338466048, "rewards/final_reward": 1.475982413029449, "rewards/mask_iou_reward": 0.7379912065147245, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1922521591186523, "rewards/thk_ans_format_reward": 1.0, "step": 1836, "think_completion_length": 49.59375 }, { "clip_ratio": 0.0, "completion_length": 158.6875, "epoch": 3.1028667790893762, "grad_norm": 7.627962147529004, "kl": 0.5595703125, "learning_rate": 3.8043844856661043e-07, "loss": 0.0006, "reward": 3.6415493488311768, "reward_std": 0.1579625979065895, "rewards/final_reward": 1.9423843051787808, "rewards/mask_iou_reward": 0.9711921525893904, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6415494084358215, "rewards/thk_ans_format_reward": 1.0, "step": 1837, "think_completion_length": 45.0625 }, { "clip_ratio": 0.0, "completion_length": 159.625, "epoch": 3.1045531197301854, "grad_norm": 18.076689222268563, "kl": 0.609375, "learning_rate": 3.8010118043844857e-07, "loss": 0.0006, "reward": 3.604332685470581, "reward_std": 0.03623810596764088, "rewards/final_reward": 1.356653644344096, "rewards/mask_iou_reward": 0.678326822172048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6043327450752258, "rewards/thk_ans_format_reward": 1.0, "step": 1838, "think_completion_length": 50.0 }, { "clip_ratio": 0.0, "completion_length": 127.25, "epoch": 3.106239460370995, "grad_norm": 60.27899415767619, "kl": 0.697265625, "learning_rate": 3.7976391231028665e-07, "loss": 0.0007, "reward": 3.1235082149505615, "reward_std": 0.017116380273364484, "rewards/final_reward": 0.9495803673395722, "rewards/mask_iou_reward": 0.4747901836697861, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1235082745552063, "rewards/thk_ans_format_reward": 1.0, "step": 1839, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 316.875, "epoch": 3.1079258010118043, "grad_norm": 10.828374197849795, "kl": 0.4296875, "learning_rate": 3.794266441821248e-07, "loss": 0.0004, "reward": 3.080757737159729, "reward_std": 0.42562781274318695, "rewards/final_reward": 1.7166104285447097, "rewards/mask_iou_reward": 0.8583052142723548, "rewards/sam_format_reward": 0.875, "rewards/sam_reward_func_ultra": 1.3307577967643738, "rewards/thk_ans_format_reward": 0.875, "step": 1840, "think_completion_length": 45.09375 }, { "clip_ratio": 0.0, "completion_length": 116.671875, "epoch": 3.109612141652614, "grad_norm": 29.597362909569423, "kl": 0.7265625, "learning_rate": 3.790893760539629e-07, "loss": 0.0005, "reward": 3.8099430799484253, "reward_std": 0.01841105322819203, "rewards/final_reward": 1.9609085731482856, "rewards/mask_iou_reward": 0.9804542865741428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8099430799484253, "rewards/thk_ans_format_reward": 1.0, "step": 1841, "think_completion_length": 48.3125 }, { "clip_ratio": 0.0, "completion_length": 117.0, "epoch": 3.111298482293423, "grad_norm": 6.143032352992298, "kl": 0.64453125, "learning_rate": 3.7875210792580097e-07, "loss": 0.0007, "reward": 3.4265583753585815, "reward_std": 0.020277044735848904, "rewards/final_reward": 1.4126127862405058, "rewards/mask_iou_reward": 0.7063063931202529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4265583157539368, "rewards/thk_ans_format_reward": 1.0, "step": 1842, "think_completion_length": 51.59375 }, { "clip_ratio": 0.0, "completion_length": 170.78125, "epoch": 3.1129848229342327, "grad_norm": 9.920229412362959, "kl": 0.521484375, "learning_rate": 3.784148397976391e-07, "loss": 0.0005, "reward": 3.4824771881103516, "reward_std": 0.18812450766563416, "rewards/final_reward": 1.4282994211107694, "rewards/mask_iou_reward": 0.7141497105553847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4824773669242859, "rewards/thk_ans_format_reward": 1.0, "step": 1843, "think_completion_length": 45.09375 }, { "clip_ratio": 0.0, "completion_length": 106.125, "epoch": 3.1146711635750424, "grad_norm": 5.468730327469074, "kl": 0.568359375, "learning_rate": 3.7807757166947725e-07, "loss": 0.0006, "reward": 3.3906657695770264, "reward_std": 0.44338157773017883, "rewards/final_reward": 1.1189098370969175, "rewards/mask_iou_reward": 0.5594549185484587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3906657695770264, "rewards/thk_ans_format_reward": 1.0, "step": 1844, "think_completion_length": 40.53125 }, { "clip_ratio": 0.0, "completion_length": 166.59375, "epoch": 3.1163575042158516, "grad_norm": 4.671244035097516, "kl": 0.587890625, "learning_rate": 3.777403035413153e-07, "loss": 0.0006, "reward": 3.467657446861267, "reward_std": 0.14314634166657925, "rewards/final_reward": 1.4949584161797376, "rewards/mask_iou_reward": 0.7474792080898688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4676575064659119, "rewards/thk_ans_format_reward": 1.0, "step": 1845, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 117.984375, "epoch": 3.118043844856661, "grad_norm": 9.86402742741037, "kl": 0.58203125, "learning_rate": 3.774030354131534e-07, "loss": 0.0006, "reward": 3.4636902809143066, "reward_std": 0.39987847208976746, "rewards/final_reward": 1.5898304441749473, "rewards/mask_iou_reward": 0.7949152220874737, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4636903405189514, "rewards/thk_ans_format_reward": 1.0, "step": 1846, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 121.84375, "epoch": 3.1197301854974704, "grad_norm": 9.527701121732772, "kl": 0.642578125, "learning_rate": 3.7706576728499157e-07, "loss": 0.0006, "reward": 2.6779627799987793, "reward_std": 0.21313096582889557, "rewards/final_reward": 0.5526819332576769, "rewards/mask_iou_reward": 0.27634096662883845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6779626905918121, "rewards/thk_ans_format_reward": 1.0, "step": 1847, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 119.3125, "epoch": 3.12141652613828, "grad_norm": 17.793087006055625, "kl": 0.5703125, "learning_rate": 3.7672849915682965e-07, "loss": 0.0006, "reward": 3.570667028427124, "reward_std": 0.1760418675839901, "rewards/final_reward": 1.8226773166599215, "rewards/mask_iou_reward": 0.9113386583299607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5706669092178345, "rewards/thk_ans_format_reward": 1.0, "step": 1848, "think_completion_length": 51.59375 }, { "clip_ratio": 0.0, "completion_length": 120.234375, "epoch": 3.123102866779089, "grad_norm": 6.947591571261172, "kl": 0.58984375, "learning_rate": 3.7639123102866774e-07, "loss": 0.0006, "reward": 3.3724918365478516, "reward_std": 0.1368257123976946, "rewards/final_reward": 1.3695020442669015, "rewards/mask_iou_reward": 0.6847510221334507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3724918365478516, "rewards/thk_ans_format_reward": 1.0, "step": 1849, "think_completion_length": 47.78125 }, { "clip_ratio": 0.0, "completion_length": 113.984375, "epoch": 3.124789207419899, "grad_norm": 9.58505081019304, "kl": 0.7265625, "learning_rate": 3.760539629005059e-07, "loss": 0.0007, "reward": 3.7754982709884644, "reward_std": 0.03654424054548144, "rewards/final_reward": 1.6380196415780146, "rewards/mask_iou_reward": 0.8190098207890073, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7754983305931091, "rewards/thk_ans_format_reward": 1.0, "step": 1850, "think_completion_length": 40.5 }, { "clip_ratio": 0.0, "completion_length": 115.5625, "epoch": 3.126475548060708, "grad_norm": 5.781252685852134, "kl": 0.5390625, "learning_rate": 3.75716694772344e-07, "loss": 0.0005, "reward": 3.590414047241211, "reward_std": 0.09510018303990364, "rewards/final_reward": 1.7070859122080297, "rewards/mask_iou_reward": 0.8535429561040149, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.590414047241211, "rewards/thk_ans_format_reward": 1.0, "step": 1851, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 162.421875, "epoch": 3.1281618887015177, "grad_norm": 11.536011446857135, "kl": 0.5126953125, "learning_rate": 3.753794266441821e-07, "loss": 0.0005, "reward": 3.3813726902008057, "reward_std": 0.19809474796056747, "rewards/final_reward": 0.9778288256741281, "rewards/mask_iou_reward": 0.48891441283706405, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3813725113868713, "rewards/thk_ans_format_reward": 1.0, "step": 1852, "think_completion_length": 39.375 }, { "clip_ratio": 0.0, "completion_length": 147.390625, "epoch": 3.1298482293423273, "grad_norm": 10.820383708545043, "kl": 0.62109375, "learning_rate": 3.7504215851602025e-07, "loss": 0.0006, "reward": 3.6570252180099487, "reward_std": 0.21269061416387558, "rewards/final_reward": 1.6539371430881022, "rewards/mask_iou_reward": 0.8269685715440511, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6570250988006592, "rewards/thk_ans_format_reward": 1.0, "step": 1853, "think_completion_length": 48.4375 }, { "clip_ratio": 0.0, "completion_length": 170.109375, "epoch": 3.1315345699831365, "grad_norm": 170.8577386926615, "kl": 0.705078125, "learning_rate": 3.7470489038785834e-07, "loss": 0.0007, "reward": 3.6504067182540894, "reward_std": 0.07961778342723846, "rewards/final_reward": 1.6228600793158108, "rewards/mask_iou_reward": 0.8114300396579054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6504066586494446, "rewards/thk_ans_format_reward": 1.0, "step": 1854, "think_completion_length": 47.78125 }, { "clip_ratio": 0.0, "completion_length": 110.484375, "epoch": 3.133220910623946, "grad_norm": 20.313002252551094, "kl": 0.580078125, "learning_rate": 3.743676222596964e-07, "loss": 0.0006, "reward": 3.3023815155029297, "reward_std": 0.1649649254977703, "rewards/final_reward": 1.058155867642932, "rewards/mask_iou_reward": 0.529077933821466, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3023815155029297, "rewards/thk_ans_format_reward": 1.0, "step": 1855, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 219.25, "epoch": 3.1349072512647553, "grad_norm": 5.257598384804945, "kl": 0.4658203125, "learning_rate": 3.7403035413153456e-07, "loss": 0.0005, "reward": 3.3379251956939697, "reward_std": 0.20651425421237946, "rewards/final_reward": 1.2870029569744763, "rewards/mask_iou_reward": 0.6435014784872382, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.337925374507904, "rewards/thk_ans_format_reward": 1.0, "step": 1856, "think_completion_length": 45.90625 }, { "clip_ratio": 0.0, "completion_length": 149.46875, "epoch": 3.136593591905565, "grad_norm": 10.768937765332302, "kl": 0.474609375, "learning_rate": 3.736930860033727e-07, "loss": 0.0005, "reward": 3.6509578227996826, "reward_std": 0.31961746513843536, "rewards/final_reward": 1.6596256594304695, "rewards/mask_iou_reward": 0.8298128297152347, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.6665828227996826, "rewards/thk_ans_format_reward": 1.0, "step": 1857, "think_completion_length": 46.40625 }, { "clip_ratio": 0.0, "completion_length": 107.6875, "epoch": 3.138279932546374, "grad_norm": 9.965618795679344, "kl": 0.544921875, "learning_rate": 3.7335581787521074e-07, "loss": 0.0005, "reward": 3.0416864156723022, "reward_std": 0.19793753325939178, "rewards/final_reward": 1.1762378400742732, "rewards/mask_iou_reward": 0.5881189200371366, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0416864231228828, "rewards/thk_ans_format_reward": 1.0, "step": 1858, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 119.6875, "epoch": 3.139966273187184, "grad_norm": 8.233677040831926, "kl": 0.5703125, "learning_rate": 3.730185497470489e-07, "loss": 0.0006, "reward": 3.6706053018569946, "reward_std": 0.10431353002786636, "rewards/final_reward": 1.8973720981450843, "rewards/mask_iou_reward": 0.9486860490725422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6706053018569946, "rewards/thk_ans_format_reward": 1.0, "step": 1859, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 162.828125, "epoch": 3.1416526138279934, "grad_norm": 12.952010142511586, "kl": 0.66796875, "learning_rate": 3.72681281618887e-07, "loss": 0.0007, "reward": 3.4579014778137207, "reward_std": 0.23089369386434555, "rewards/final_reward": 1.597618762950591, "rewards/mask_iou_reward": 0.7988093814752955, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4579015374183655, "rewards/thk_ans_format_reward": 1.0, "step": 1860, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 168.640625, "epoch": 3.1433389544688026, "grad_norm": 9.221455733853075, "kl": 0.583984375, "learning_rate": 3.723440134907251e-07, "loss": 0.0006, "reward": 3.3412728309631348, "reward_std": 0.387121319770813, "rewards/final_reward": 1.2192848265220455, "rewards/mask_iou_reward": 0.6096424132610228, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3725228309631348, "rewards/thk_ans_format_reward": 0.984375, "step": 1861, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 189.859375, "epoch": 3.1450252951096123, "grad_norm": 10.246420327009988, "kl": 0.5166015625, "learning_rate": 3.720067453625632e-07, "loss": 0.0005, "reward": 3.467332124710083, "reward_std": 0.11898842453956604, "rewards/final_reward": 1.3554770121678938, "rewards/mask_iou_reward": 0.6777385060839469, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4673320055007935, "rewards/thk_ans_format_reward": 1.0, "step": 1862, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 126.703125, "epoch": 3.1467116357504215, "grad_norm": 4.853357004136349, "kl": 0.54296875, "learning_rate": 3.7166947723440133e-07, "loss": 0.0006, "reward": 3.306196928024292, "reward_std": 0.26819103956222534, "rewards/final_reward": 1.0140351818526023, "rewards/mask_iou_reward": 0.5070175909263012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3061969578266144, "rewards/thk_ans_format_reward": 1.0, "step": 1863, "think_completion_length": 47.15625 }, { "clip_ratio": 0.0, "completion_length": 142.6875, "epoch": 3.148397976391231, "grad_norm": 15.97786230847862, "kl": 0.814453125, "learning_rate": 3.713322091062395e-07, "loss": 0.0008, "reward": 3.4921233654022217, "reward_std": 0.1628289446234703, "rewards/final_reward": 1.8660033928082365, "rewards/mask_iou_reward": 0.9330016964041182, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4921232461929321, "rewards/thk_ans_format_reward": 1.0, "step": 1864, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 128.28125, "epoch": 3.1500843170320403, "grad_norm": 8.047713030230051, "kl": 0.609375, "learning_rate": 3.7099494097807756e-07, "loss": 0.0006, "reward": 3.2138736248016357, "reward_std": 0.040225003845989704, "rewards/final_reward": 0.9726650117579593, "rewards/mask_iou_reward": 0.48633250587897964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2138735353946686, "rewards/thk_ans_format_reward": 1.0, "step": 1865, "think_completion_length": 43.84375 }, { "clip_ratio": 0.0, "completion_length": 104.09375, "epoch": 3.15177065767285, "grad_norm": 16.52763285766392, "kl": 0.62109375, "learning_rate": 3.7065767284991565e-07, "loss": 0.0006, "reward": 3.612211227416992, "reward_std": 0.21964190807193518, "rewards/final_reward": 1.6254546480444538, "rewards/mask_iou_reward": 0.8127273240222269, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6122112274169922, "rewards/thk_ans_format_reward": 1.0, "step": 1866, "think_completion_length": 43.125 }, { "clip_ratio": 0.0, "completion_length": 120.71875, "epoch": 3.1534569983136596, "grad_norm": 4.863586737187257, "kl": 0.91015625, "learning_rate": 3.703204047217538e-07, "loss": 0.0009, "reward": 3.467667579650879, "reward_std": 0.1343383565545082, "rewards/final_reward": 1.3809162033320566, "rewards/mask_iou_reward": 0.6904581016660283, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.467667579650879, "rewards/thk_ans_format_reward": 1.0, "step": 1867, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 147.46875, "epoch": 3.1551433389544687, "grad_norm": 8.785152209092196, "kl": 0.53515625, "learning_rate": 3.699831365935919e-07, "loss": 0.0005, "reward": 3.3410245180130005, "reward_std": 0.19761168956756592, "rewards/final_reward": 1.7634354339104432, "rewards/mask_iou_reward": 0.8817177169552216, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3410245180130005, "rewards/thk_ans_format_reward": 1.0, "step": 1868, "think_completion_length": 40.78125 }, { "clip_ratio": 0.0, "completion_length": 116.5625, "epoch": 3.1568296795952784, "grad_norm": 19.595912000538206, "kl": 0.587890625, "learning_rate": 3.6964586846543e-07, "loss": 0.0006, "reward": 2.9621787071228027, "reward_std": 0.1222074730321765, "rewards/final_reward": 0.841697455205978, "rewards/mask_iou_reward": 0.420848727602989, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9621787965297699, "rewards/thk_ans_format_reward": 1.0, "step": 1869, "think_completion_length": 49.625 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 3.1585160202360876, "grad_norm": 9.419372854331563, "kl": 0.578125, "learning_rate": 3.693086003372681e-07, "loss": 0.0006, "reward": 3.422818660736084, "reward_std": 0.2237471342086792, "rewards/final_reward": 1.283719321853805, "rewards/mask_iou_reward": 0.6418596609269025, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4228186011314392, "rewards/thk_ans_format_reward": 1.0, "step": 1870, "think_completion_length": 48.65625 }, { "clip_ratio": 0.0, "completion_length": 116.4375, "epoch": 3.160202360876897, "grad_norm": 7.825248544089704, "kl": 0.86328125, "learning_rate": 3.689713322091062e-07, "loss": 0.0009, "reward": 3.4461352825164795, "reward_std": 0.1912602037191391, "rewards/final_reward": 1.5204806993205606, "rewards/mask_iou_reward": 0.7602403496602803, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.44613516330719, "rewards/thk_ans_format_reward": 1.0, "step": 1871, "think_completion_length": 49.5625 }, { "clip_ratio": 0.0, "completion_length": 184.90625, "epoch": 3.1618887015177064, "grad_norm": 6.866246401018885, "kl": 0.5703125, "learning_rate": 3.6863406408094433e-07, "loss": 0.0006, "reward": 2.7474160194396973, "reward_std": 0.17037902772426605, "rewards/final_reward": 0.9670391487885153, "rewards/mask_iou_reward": 0.48351957439425763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7474161013960838, "rewards/thk_ans_format_reward": 1.0, "step": 1872, "think_completion_length": 43.96875 }, { "clip_ratio": 0.0, "completion_length": 121.84375, "epoch": 3.163575042158516, "grad_norm": 14.437614062731507, "kl": 0.552734375, "learning_rate": 3.6829679595278247e-07, "loss": 0.0005, "reward": 3.483241558074951, "reward_std": 0.3002520129084587, "rewards/final_reward": 1.7079844480126627, "rewards/mask_iou_reward": 0.8539922240063313, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.483241617679596, "rewards/thk_ans_format_reward": 1.0, "step": 1873, "think_completion_length": 49.90625 }, { "clip_ratio": 0.0, "completion_length": 112.53125, "epoch": 3.1652613827993257, "grad_norm": 7.261877769797301, "kl": 0.6015625, "learning_rate": 3.679595278246205e-07, "loss": 0.0006, "reward": 3.2580798864364624, "reward_std": 0.11684287153184414, "rewards/final_reward": 1.677791107797584, "rewards/mask_iou_reward": 0.838895553898792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2580798864364624, "rewards/thk_ans_format_reward": 1.0, "step": 1874, "think_completion_length": 43.34375 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 3.166947723440135, "grad_norm": 6.984406291862621, "kl": 0.666015625, "learning_rate": 3.6762225969645865e-07, "loss": 0.0007, "reward": 3.373793601989746, "reward_std": 0.1073999097570777, "rewards/final_reward": 1.5053395216827492, "rewards/mask_iou_reward": 0.7526697608413746, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3737936615943909, "rewards/thk_ans_format_reward": 1.0, "step": 1875, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 180.265625, "epoch": 3.1686340640809445, "grad_norm": 12.918770229866686, "kl": 0.5078125, "learning_rate": 3.672849915682968e-07, "loss": 0.0005, "reward": 2.6261537075042725, "reward_std": 0.35085177421569824, "rewards/final_reward": 0.20398478661733804, "rewards/mask_iou_reward": 0.10199239330866902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6261536031961441, "rewards/thk_ans_format_reward": 1.0, "step": 1876, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 149.390625, "epoch": 3.1703204047217537, "grad_norm": 7.044151304726128, "kl": 0.5234375, "learning_rate": 3.6694772344013493e-07, "loss": 0.0005, "reward": 3.3403379917144775, "reward_std": 0.09493143483996391, "rewards/final_reward": 1.0117943134491862, "rewards/mask_iou_reward": 0.5058971567245931, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3403378129005432, "rewards/thk_ans_format_reward": 1.0, "step": 1877, "think_completion_length": 44.625 }, { "clip_ratio": 0.0, "completion_length": 127.21875, "epoch": 3.1720067453625633, "grad_norm": 7.004275452012231, "kl": 0.5185546875, "learning_rate": 3.6661045531197296e-07, "loss": 0.0004, "reward": 3.5764840841293335, "reward_std": 0.18109191954135895, "rewards/final_reward": 1.7178301548804966, "rewards/mask_iou_reward": 0.8589150774402483, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5764840841293335, "rewards/thk_ans_format_reward": 1.0, "step": 1878, "think_completion_length": 49.375 }, { "clip_ratio": 0.0, "completion_length": 114.4375, "epoch": 3.1736930860033725, "grad_norm": 7.125834124063926, "kl": 0.609375, "learning_rate": 3.662731871838111e-07, "loss": 0.0006, "reward": 3.4124138355255127, "reward_std": 0.24705388210713863, "rewards/final_reward": 0.9846058681887829, "rewards/mask_iou_reward": 0.49230293409439146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4124139547348022, "rewards/thk_ans_format_reward": 1.0, "step": 1879, "think_completion_length": 49.875 }, { "clip_ratio": 0.0, "completion_length": 114.84375, "epoch": 3.175379426644182, "grad_norm": 9.54853842110885, "kl": 0.69140625, "learning_rate": 3.6593591905564924e-07, "loss": 0.0007, "reward": 3.36881947517395, "reward_std": 0.2934834212064743, "rewards/final_reward": 1.6325663245336783, "rewards/mask_iou_reward": 0.8162831622668392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3688194751739502, "rewards/thk_ans_format_reward": 1.0, "step": 1880, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 122.40625, "epoch": 3.177065767284992, "grad_norm": 16.811266466139205, "kl": 0.58203125, "learning_rate": 3.6559865092748733e-07, "loss": 0.0006, "reward": 3.3678882122039795, "reward_std": 0.1498733222251758, "rewards/final_reward": 1.533214750984277, "rewards/mask_iou_reward": 0.7666073754921385, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.367888331413269, "rewards/thk_ans_format_reward": 1.0, "step": 1881, "think_completion_length": 48.28125 }, { "clip_ratio": 0.0, "completion_length": 116.203125, "epoch": 3.178752107925801, "grad_norm": 20.32912750334905, "kl": 0.609375, "learning_rate": 3.6526138279932547e-07, "loss": 0.0006, "reward": 3.725816488265991, "reward_std": 0.07194982096552849, "rewards/final_reward": 1.869411332281031, "rewards/mask_iou_reward": 0.9347056661405155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7258164882659912, "rewards/thk_ans_format_reward": 1.0, "step": 1882, "think_completion_length": 42.90625 }, { "clip_ratio": 0.0, "completion_length": 134.078125, "epoch": 3.1804384485666106, "grad_norm": 13.022071736423868, "kl": 0.560546875, "learning_rate": 3.6492411467116356e-07, "loss": 0.0006, "reward": 3.3680756092071533, "reward_std": 0.164290108717978, "rewards/final_reward": 1.1868072526412312, "rewards/mask_iou_reward": 0.5934036263206156, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.368075668811798, "rewards/thk_ans_format_reward": 1.0, "step": 1883, "think_completion_length": 47.875 }, { "clip_ratio": 0.0, "completion_length": 126.390625, "epoch": 3.18212478920742, "grad_norm": 17.883421271785252, "kl": 0.56640625, "learning_rate": 3.6458684654300165e-07, "loss": 0.0006, "reward": 3.207452654838562, "reward_std": 0.4637444317340851, "rewards/final_reward": 1.2528438943904214, "rewards/mask_iou_reward": 0.6264219471952107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2074525952339172, "rewards/thk_ans_format_reward": 1.0, "step": 1884, "think_completion_length": 51.1875 }, { "clip_ratio": 0.0, "completion_length": 114.796875, "epoch": 3.1838111298482294, "grad_norm": 15.091080800093014, "kl": 0.578125, "learning_rate": 3.642495784148398e-07, "loss": 0.0006, "reward": 2.940601348876953, "reward_std": 0.13293109834194183, "rewards/final_reward": 0.568362582486542, "rewards/mask_iou_reward": 0.284181291243271, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9406013190746307, "rewards/thk_ans_format_reward": 1.0, "step": 1885, "think_completion_length": 49.21875 }, { "clip_ratio": 0.0, "completion_length": 135.46875, "epoch": 3.1854974704890386, "grad_norm": 7.057374034683764, "kl": 0.541015625, "learning_rate": 3.6391231028667793e-07, "loss": 0.0005, "reward": 3.1076853275299072, "reward_std": 0.07365784235298634, "rewards/final_reward": 1.326237704850417, "rewards/mask_iou_reward": 0.6631188524252085, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1076853275299072, "rewards/thk_ans_format_reward": 1.0, "step": 1886, "think_completion_length": 45.34375 }, { "clip_ratio": 0.0, "completion_length": 118.125, "epoch": 3.1871838111298483, "grad_norm": 10.782989362018533, "kl": 0.5703125, "learning_rate": 3.6357504215851596e-07, "loss": 0.0006, "reward": 3.658339738845825, "reward_std": 0.015222079586237669, "rewards/final_reward": 1.7016695886711926, "rewards/mask_iou_reward": 0.8508347943355963, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6583398580551147, "rewards/thk_ans_format_reward": 1.0, "step": 1887, "think_completion_length": 47.25 }, { "clip_ratio": 0.0, "completion_length": 125.734375, "epoch": 3.1888701517706575, "grad_norm": 16.208059848588775, "kl": 0.884765625, "learning_rate": 3.632377740303541e-07, "loss": 0.0009, "reward": 3.001835823059082, "reward_std": 0.17691810801625252, "rewards/final_reward": 0.9908187527331923, "rewards/mask_iou_reward": 0.49540937636659615, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0018357634544373, "rewards/thk_ans_format_reward": 1.0, "step": 1888, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 117.8125, "epoch": 3.190556492411467, "grad_norm": 8.666552629160558, "kl": 0.544921875, "learning_rate": 3.6290050590219224e-07, "loss": 0.0005, "reward": 3.2973833084106445, "reward_std": 0.20763015747070312, "rewards/final_reward": 1.6951685637802774, "rewards/mask_iou_reward": 0.8475842818901387, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2973833084106445, "rewards/thk_ans_format_reward": 1.0, "step": 1889, "think_completion_length": 50.375 }, { "clip_ratio": 0.0, "completion_length": 117.125, "epoch": 3.1922428330522767, "grad_norm": 8.026625405899852, "kl": 0.5546875, "learning_rate": 3.625632377740304e-07, "loss": 0.0006, "reward": 3.849402904510498, "reward_std": 0.04095839988440275, "rewards/final_reward": 1.8241999084811233, "rewards/mask_iou_reward": 0.9120999542405617, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.849402904510498, "rewards/thk_ans_format_reward": 1.0, "step": 1890, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 124.34375, "epoch": 3.193929173693086, "grad_norm": 12.34817957993866, "kl": 0.6015625, "learning_rate": 3.622259696458684e-07, "loss": 0.0006, "reward": 3.621413826942444, "reward_std": 0.04138875612989068, "rewards/final_reward": 1.8486262133262046, "rewards/mask_iou_reward": 0.9243131066631023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6214138269424438, "rewards/thk_ans_format_reward": 1.0, "step": 1891, "think_completion_length": 47.28125 }, { "clip_ratio": 0.0, "completion_length": 116.765625, "epoch": 3.1956155143338956, "grad_norm": 23.116554236485523, "kl": 0.572265625, "learning_rate": 3.6188870151770656e-07, "loss": 0.0006, "reward": 3.0774978399276733, "reward_std": 0.01694110711105168, "rewards/final_reward": 0.7942402422055221, "rewards/mask_iou_reward": 0.39712012110276107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0774978697299957, "rewards/thk_ans_format_reward": 1.0, "step": 1892, "think_completion_length": 46.78125 }, { "clip_ratio": 0.0, "completion_length": 114.921875, "epoch": 3.1973018549747048, "grad_norm": 46.81499159655645, "kl": 0.625, "learning_rate": 3.615514333895447e-07, "loss": 0.0006, "reward": 3.678188443183899, "reward_std": 0.11049404554069042, "rewards/final_reward": 1.4534138323408698, "rewards/mask_iou_reward": 0.7267069161704349, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6781885027885437, "rewards/thk_ans_format_reward": 1.0, "step": 1893, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 120.65625, "epoch": 3.1989881956155144, "grad_norm": 7.208762040832486, "kl": 0.650390625, "learning_rate": 3.612141652613828e-07, "loss": 0.0007, "reward": 3.148299217224121, "reward_std": 0.14013096690177917, "rewards/final_reward": 1.7720664262952694, "rewards/mask_iou_reward": 0.8860332131476347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1482991874217987, "rewards/thk_ans_format_reward": 1.0, "step": 1894, "think_completion_length": 51.03125 }, { "clip_ratio": 0.0, "completion_length": 117.59375, "epoch": 3.2006745362563236, "grad_norm": 20.349059929554834, "kl": 0.701171875, "learning_rate": 3.6087689713322087e-07, "loss": 0.0007, "reward": 3.4096421003341675, "reward_std": 0.020920042879879475, "rewards/final_reward": 1.6369392328477477, "rewards/mask_iou_reward": 0.8184696164238738, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4096421599388123, "rewards/thk_ans_format_reward": 1.0, "step": 1895, "think_completion_length": 51.6875 }, { "clip_ratio": 0.0, "completion_length": 263.734375, "epoch": 3.2023608768971332, "grad_norm": 12.582295871610476, "kl": 0.4169921875, "learning_rate": 3.60539629005059e-07, "loss": 0.0004, "reward": 2.980480432510376, "reward_std": 0.2008163258433342, "rewards/final_reward": 1.269321376141052, "rewards/mask_iou_reward": 0.634660688070526, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.0117304623126984, "rewards/thk_ans_format_reward": 0.984375, "step": 1896, "think_completion_length": 52.90625 }, { "clip_ratio": 0.0, "completion_length": 116.765625, "epoch": 3.204047217537943, "grad_norm": 9.450852600185378, "kl": 0.5546875, "learning_rate": 3.602023608768971e-07, "loss": 0.0005, "reward": 3.686271071434021, "reward_std": 0.06225780211389065, "rewards/final_reward": 1.4150984738944112, "rewards/mask_iou_reward": 0.7075492369472056, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6862711906433105, "rewards/thk_ans_format_reward": 1.0, "step": 1897, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 118.140625, "epoch": 3.205733558178752, "grad_norm": 24.499457257777063, "kl": 0.74609375, "learning_rate": 3.5986509274873524e-07, "loss": 0.0008, "reward": 3.770930767059326, "reward_std": 0.036563062109053135, "rewards/final_reward": 1.723879682408082, "rewards/mask_iou_reward": 0.861939841204041, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.770930826663971, "rewards/thk_ans_format_reward": 1.0, "step": 1898, "think_completion_length": 46.1875 }, { "clip_ratio": 0.0, "completion_length": 230.359375, "epoch": 3.2074198988195617, "grad_norm": 25.75726985246122, "kl": 0.50390625, "learning_rate": 3.5952782462057333e-07, "loss": 0.0005, "reward": 3.2525272369384766, "reward_std": 0.3233413156121969, "rewards/final_reward": 1.0621304009366679, "rewards/mask_iou_reward": 0.5310652004683339, "rewards/sam_format_reward": 0.921875, "rewards/sam_reward_func_ultra": 1.4087771773338318, "rewards/thk_ans_format_reward": 0.921875, "step": 1899, "think_completion_length": 47.5625 }, { "clip_ratio": 0.0, "completion_length": 117.859375, "epoch": 3.209106239460371, "grad_norm": 7.243676204871831, "kl": 0.525390625, "learning_rate": 3.5919055649241147e-07, "loss": 0.0005, "reward": 3.7162941694259644, "reward_std": 0.11420441046357155, "rewards/final_reward": 1.7875524891500199, "rewards/mask_iou_reward": 0.8937762445750099, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7162942290306091, "rewards/thk_ans_format_reward": 1.0, "step": 1900, "think_completion_length": 48.15625 }, { "clip_ratio": 0.0, "completion_length": 116.4375, "epoch": 3.2107925801011805, "grad_norm": 15.670700195164919, "kl": 0.64453125, "learning_rate": 3.5885328836424955e-07, "loss": 0.0006, "reward": 2.7376617193222046, "reward_std": 0.0967277530580759, "rewards/final_reward": 0.6101376436250409, "rewards/mask_iou_reward": 0.30506882181252043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7376617193222046, "rewards/thk_ans_format_reward": 1.0, "step": 1901, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 119.203125, "epoch": 3.2124789207419897, "grad_norm": 9.917814521267253, "kl": 0.828125, "learning_rate": 3.585160202360877e-07, "loss": 0.0008, "reward": 3.349041223526001, "reward_std": 0.11464390531182289, "rewards/final_reward": 1.6008897912950604, "rewards/mask_iou_reward": 0.8004448956475302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.349041372537613, "rewards/thk_ans_format_reward": 1.0, "step": 1902, "think_completion_length": 47.03125 }, { "clip_ratio": 0.0, "completion_length": 132.0625, "epoch": 3.2141652613827993, "grad_norm": 13.716296122192867, "kl": 0.482421875, "learning_rate": 3.581787521079258e-07, "loss": 0.0005, "reward": 3.7791026830673218, "reward_std": 0.16377420909702778, "rewards/final_reward": 1.7674922340875896, "rewards/mask_iou_reward": 0.8837461170437948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7791024446487427, "rewards/thk_ans_format_reward": 1.0, "step": 1903, "think_completion_length": 47.25 }, { "clip_ratio": 0.0, "completion_length": 315.265625, "epoch": 3.2158516020236085, "grad_norm": 6.917382991288502, "kl": 0.4609375, "learning_rate": 3.5784148397976387e-07, "loss": 0.0005, "reward": 3.2332409620285034, "reward_std": 0.11346263438463211, "rewards/final_reward": 1.6959626416145304, "rewards/mask_iou_reward": 0.8479813208072652, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2332409620285034, "rewards/thk_ans_format_reward": 1.0, "step": 1904, "think_completion_length": 39.4375 }, { "clip_ratio": 0.0, "completion_length": 152.40625, "epoch": 3.217537942664418, "grad_norm": 6.471546535689695, "kl": 0.4716796875, "learning_rate": 3.57504215851602e-07, "loss": 0.0005, "reward": 3.0267739295959473, "reward_std": 0.02534060279140249, "rewards/final_reward": 1.7786319183385815, "rewards/mask_iou_reward": 0.8893159591692907, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.026773989200592, "rewards/thk_ans_format_reward": 1.0, "step": 1905, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 117.953125, "epoch": 3.219224283305228, "grad_norm": 10.204264759824875, "kl": 0.5068359375, "learning_rate": 3.5716694772344015e-07, "loss": 0.0005, "reward": 2.850593686103821, "reward_std": 0.19878476485610008, "rewards/final_reward": 1.4631749075644072, "rewards/mask_iou_reward": 0.7315874537822036, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8505937159061432, "rewards/thk_ans_format_reward": 1.0, "step": 1906, "think_completion_length": 48.1875 }, { "clip_ratio": 0.0, "completion_length": 120.6875, "epoch": 3.220910623946037, "grad_norm": 4.331052028511863, "kl": 0.60546875, "learning_rate": 3.568296795952782e-07, "loss": 0.0006, "reward": 2.796687960624695, "reward_std": 0.14800949243362993, "rewards/final_reward": 0.0, "rewards/mask_iou_reward": 0.0, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7966879308223724, "rewards/thk_ans_format_reward": 1.0, "step": 1907, "think_completion_length": 53.125 }, { "clip_ratio": 0.0, "completion_length": 120.5625, "epoch": 3.2225969645868466, "grad_norm": 12.014305801190648, "kl": 0.5625, "learning_rate": 3.564924114671163e-07, "loss": 0.0006, "reward": 3.552351474761963, "reward_std": 0.1006348617374897, "rewards/final_reward": 1.178661954997034, "rewards/mask_iou_reward": 0.589330977498517, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.552351474761963, "rewards/thk_ans_format_reward": 1.0, "step": 1908, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 141.8125, "epoch": 3.224283305227656, "grad_norm": 21.494166836430573, "kl": 0.5, "learning_rate": 3.5615514333895447e-07, "loss": 0.0005, "reward": 3.580656051635742, "reward_std": 0.1924985572695732, "rewards/final_reward": 1.5555895783528935, "rewards/mask_iou_reward": 0.7777947891764467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.580656111240387, "rewards/thk_ans_format_reward": 1.0, "step": 1909, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 116.109375, "epoch": 3.2259696458684655, "grad_norm": 9.042788517181418, "kl": 0.55859375, "learning_rate": 3.5581787521079255e-07, "loss": 0.0006, "reward": 3.6765564680099487, "reward_std": 0.15247973427176476, "rewards/final_reward": 1.6539351637884079, "rewards/mask_iou_reward": 0.8269675818942039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6765565276145935, "rewards/thk_ans_format_reward": 1.0, "step": 1910, "think_completion_length": 44.09375 }, { "clip_ratio": 0.0, "completion_length": 117.03125, "epoch": 3.2276559865092747, "grad_norm": 30.946422055169762, "kl": 0.69140625, "learning_rate": 3.554806070826307e-07, "loss": 0.0007, "reward": 3.311057448387146, "reward_std": 0.05010443180799484, "rewards/final_reward": 1.2626555786327738, "rewards/mask_iou_reward": 0.6313277893163869, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3110575675964355, "rewards/thk_ans_format_reward": 1.0, "step": 1911, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 116.546875, "epoch": 3.2293423271500843, "grad_norm": 15.95198649741166, "kl": 0.572265625, "learning_rate": 3.551433389544688e-07, "loss": 0.0006, "reward": 3.87019944190979, "reward_std": 0.008461029967293143, "rewards/final_reward": 1.8844051330535168, "rewards/mask_iou_reward": 0.9422025665267584, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8701993823051453, "rewards/thk_ans_format_reward": 1.0, "step": 1912, "think_completion_length": 49.71875 }, { "clip_ratio": 0.0, "completion_length": 119.3125, "epoch": 3.231028667790894, "grad_norm": 71.42783539007475, "kl": 0.587890625, "learning_rate": 3.548060708263069e-07, "loss": 0.0006, "reward": 3.5393285751342773, "reward_std": 0.0627220245078206, "rewards/final_reward": 1.3345203832878214, "rewards/mask_iou_reward": 0.6672601916439107, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5393285751342773, "rewards/thk_ans_format_reward": 1.0, "step": 1913, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 227.25, "epoch": 3.232715008431703, "grad_norm": 12.793804181066932, "kl": 0.498046875, "learning_rate": 3.54468802698145e-07, "loss": 0.0005, "reward": 2.918440103530884, "reward_std": 0.39514149725437164, "rewards/final_reward": 1.2474688527428803, "rewards/mask_iou_reward": 0.6237344263714402, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.980940043926239, "rewards/thk_ans_format_reward": 0.96875, "step": 1914, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 204.875, "epoch": 3.2344013490725128, "grad_norm": 12.041314585208962, "kl": 0.4609375, "learning_rate": 3.5413153456998315e-07, "loss": 0.0005, "reward": 3.235507011413574, "reward_std": 0.36636675521731377, "rewards/final_reward": 1.545750925604584, "rewards/mask_iou_reward": 0.772875462802292, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.3292570114135742, "rewards/thk_ans_format_reward": 0.953125, "step": 1915, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 116.78125, "epoch": 3.236087689713322, "grad_norm": 6.181907315446415, "kl": 0.58203125, "learning_rate": 3.5379426644182124e-07, "loss": 0.0006, "reward": 3.6867408752441406, "reward_std": 0.02472075680270791, "rewards/final_reward": 1.5338553636125272, "rewards/mask_iou_reward": 0.7669276818062636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6867409944534302, "rewards/thk_ans_format_reward": 1.0, "step": 1916, "think_completion_length": 48.9375 }, { "clip_ratio": 0.0, "completion_length": 128.796875, "epoch": 3.2377740303541316, "grad_norm": 8.259989325991016, "kl": 0.62890625, "learning_rate": 3.534569983136593e-07, "loss": 0.0006, "reward": 3.3222023248672485, "reward_std": 0.10382327809929848, "rewards/final_reward": 0.95217661103699, "rewards/mask_iou_reward": 0.476088305518495, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.322202205657959, "rewards/thk_ans_format_reward": 1.0, "step": 1917, "think_completion_length": 43.78125 }, { "clip_ratio": 0.0, "completion_length": 130.703125, "epoch": 3.2394603709949408, "grad_norm": 5.195267131537693, "kl": 0.51953125, "learning_rate": 3.5311973018549746e-07, "loss": 0.0005, "reward": 3.7313687801361084, "reward_std": 0.012648439034819603, "rewards/final_reward": 1.8695688607840397, "rewards/mask_iou_reward": 0.9347844303920199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7313688397407532, "rewards/thk_ans_format_reward": 1.0, "step": 1918, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 114.890625, "epoch": 3.2411467116357504, "grad_norm": 11.689240947938602, "kl": 0.5703125, "learning_rate": 3.527824620573356e-07, "loss": 0.0006, "reward": 3.478485584259033, "reward_std": 0.030393260531127453, "rewards/final_reward": 1.292362160744934, "rewards/mask_iou_reward": 0.646181080372467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.478485643863678, "rewards/thk_ans_format_reward": 1.0, "step": 1919, "think_completion_length": 40.3125 }, { "clip_ratio": 0.0, "completion_length": 119.171875, "epoch": 3.24283305227656, "grad_norm": 8.657308766813404, "kl": 0.6015625, "learning_rate": 3.5244519392917364e-07, "loss": 0.0006, "reward": 3.202649235725403, "reward_std": 0.23502523079514503, "rewards/final_reward": 1.0171185468343586, "rewards/mask_iou_reward": 0.5085592734171793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2026492953300476, "rewards/thk_ans_format_reward": 1.0, "step": 1920, "think_completion_length": 44.6875 }, { "clip_ratio": 0.0, "completion_length": 117.859375, "epoch": 3.2445193929173692, "grad_norm": 6.640724330191083, "kl": 0.6328125, "learning_rate": 3.521079258010118e-07, "loss": 0.0006, "reward": 3.8660894632339478, "reward_std": 0.013371082721278071, "rewards/final_reward": 1.9243815967458078, "rewards/mask_iou_reward": 0.9621907983729039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8660894632339478, "rewards/thk_ans_format_reward": 1.0, "step": 1921, "think_completion_length": 46.625 }, { "clip_ratio": 0.0, "completion_length": 116.421875, "epoch": 3.246205733558179, "grad_norm": 15.215046527172118, "kl": 0.810546875, "learning_rate": 3.517706576728499e-07, "loss": 0.0008, "reward": 3.2143337726593018, "reward_std": 0.247731015086174, "rewards/final_reward": 1.1496970084104503, "rewards/mask_iou_reward": 0.5748485042052252, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2143338322639465, "rewards/thk_ans_format_reward": 1.0, "step": 1922, "think_completion_length": 49.03125 }, { "clip_ratio": 0.0, "completion_length": 116.5, "epoch": 3.247892074198988, "grad_norm": 6.264739124085545, "kl": 0.55859375, "learning_rate": 3.51433389544688e-07, "loss": 0.0006, "reward": 3.7529088258743286, "reward_std": 0.008196833077818155, "rewards/final_reward": 1.7964630349270372, "rewards/mask_iou_reward": 0.8982315174635186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7529088854789734, "rewards/thk_ans_format_reward": 1.0, "step": 1923, "think_completion_length": 50.5 }, { "clip_ratio": 0.0, "completion_length": 135.765625, "epoch": 3.2495784148397977, "grad_norm": 31.375759798091025, "kl": 0.56640625, "learning_rate": 3.510961214165261e-07, "loss": 0.0006, "reward": 3.684548854827881, "reward_std": 0.10750999674201012, "rewards/final_reward": 1.7561618903536844, "rewards/mask_iou_reward": 0.8780809451768422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6845490336418152, "rewards/thk_ans_format_reward": 1.0, "step": 1924, "think_completion_length": 50.84375 }, { "clip_ratio": 0.0, "completion_length": 117.609375, "epoch": 3.251264755480607, "grad_norm": 5.912699538453515, "kl": 0.5703125, "learning_rate": 3.5075885328836423e-07, "loss": 0.0006, "reward": 3.3729045391082764, "reward_std": 0.0787664633244276, "rewards/final_reward": 1.3844858612820419, "rewards/mask_iou_reward": 0.6922429306410209, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3729044795036316, "rewards/thk_ans_format_reward": 1.0, "step": 1925, "think_completion_length": 48.21875 }, { "clip_ratio": 0.0, "completion_length": 119.828125, "epoch": 3.2529510961214165, "grad_norm": 43.59838934618175, "kl": 0.59765625, "learning_rate": 3.504215851602024e-07, "loss": 0.0006, "reward": 3.7652982473373413, "reward_std": 0.025617387611418962, "rewards/final_reward": 1.874484492995093, "rewards/mask_iou_reward": 0.9372422464975465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7652981281280518, "rewards/thk_ans_format_reward": 1.0, "step": 1926, "think_completion_length": 51.8125 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 3.254637436762226, "grad_norm": 83.00771680277455, "kl": 0.5419921875, "learning_rate": 3.5008431703204046e-07, "loss": 0.0005, "reward": 3.591688632965088, "reward_std": 0.11703697592020035, "rewards/final_reward": 1.6477103820219803, "rewards/mask_iou_reward": 0.8238551910109901, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5916885137557983, "rewards/thk_ans_format_reward": 1.0, "step": 1927, "think_completion_length": 48.3125 }, { "clip_ratio": 0.0, "completion_length": 114.015625, "epoch": 3.2563237774030354, "grad_norm": 24.512476139760526, "kl": 0.580078125, "learning_rate": 3.4974704890387855e-07, "loss": 0.0006, "reward": 3.7737098932266235, "reward_std": 0.012913587968796492, "rewards/final_reward": 1.8550224840824043, "rewards/mask_iou_reward": 0.9275112420412022, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7737098336219788, "rewards/thk_ans_format_reward": 1.0, "step": 1928, "think_completion_length": 42.875 }, { "clip_ratio": 0.0, "completion_length": 116.0625, "epoch": 3.258010118043845, "grad_norm": 19.987296695281866, "kl": 0.572265625, "learning_rate": 3.494097807757167e-07, "loss": 0.0006, "reward": 3.2878096103668213, "reward_std": 0.07040636241436005, "rewards/final_reward": 1.3527816448964711, "rewards/mask_iou_reward": 0.6763908224482356, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2878096401691437, "rewards/thk_ans_format_reward": 1.0, "step": 1929, "think_completion_length": 47.84375 }, { "clip_ratio": 0.0, "completion_length": 148.015625, "epoch": 3.259696458684654, "grad_norm": 6.978729980497052, "kl": 0.4814453125, "learning_rate": 3.490725126475548e-07, "loss": 0.0005, "reward": 3.7521612644195557, "reward_std": 0.039452452678233385, "rewards/final_reward": 1.8431230070469085, "rewards/mask_iou_reward": 0.9215615035234542, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7521612048149109, "rewards/thk_ans_format_reward": 1.0, "step": 1930, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 146.359375, "epoch": 3.261382799325464, "grad_norm": 10.264759231390057, "kl": 0.53515625, "learning_rate": 3.487352445193929e-07, "loss": 0.0005, "reward": 3.3465049266815186, "reward_std": 0.2511683627963066, "rewards/final_reward": 1.3293050149147185, "rewards/mask_iou_reward": 0.6646525074573593, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.346504807472229, "rewards/thk_ans_format_reward": 1.0, "step": 1931, "think_completion_length": 54.4375 }, { "clip_ratio": 0.0, "completion_length": 116.984375, "epoch": 3.263069139966273, "grad_norm": 15.61447613519973, "kl": 0.57421875, "learning_rate": 3.48397976391231e-07, "loss": 0.0006, "reward": 3.618667721748352, "reward_std": 0.03920717164874077, "rewards/final_reward": 1.7762699723621134, "rewards/mask_iou_reward": 0.8881349861810567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6186676621437073, "rewards/thk_ans_format_reward": 1.0, "step": 1932, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 123.25, "epoch": 3.2647554806070826, "grad_norm": 12.672323252058314, "kl": 0.53515625, "learning_rate": 3.480607082630691e-07, "loss": 0.0005, "reward": 3.5231988430023193, "reward_std": 0.3485229015350342, "rewards/final_reward": 1.5344188003096824, "rewards/mask_iou_reward": 0.7672094001548412, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5231987833976746, "rewards/thk_ans_format_reward": 1.0, "step": 1933, "think_completion_length": 53.1875 }, { "clip_ratio": 0.0, "completion_length": 115.5, "epoch": 3.2664418212478923, "grad_norm": 6.961605791537598, "kl": 0.544921875, "learning_rate": 3.4772344013490723e-07, "loss": 0.0006, "reward": 3.493443489074707, "reward_std": 0.015817434526979923, "rewards/final_reward": 1.3104796870689048, "rewards/mask_iou_reward": 0.6552398435344524, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4934434294700623, "rewards/thk_ans_format_reward": 1.0, "step": 1934, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 143.546875, "epoch": 3.2681281618887015, "grad_norm": 10.4325957223446, "kl": 0.5546875, "learning_rate": 3.4738617200674537e-07, "loss": 0.0006, "reward": 3.409175992012024, "reward_std": 0.014063057489693165, "rewards/final_reward": 1.6345188021142831, "rewards/mask_iou_reward": 0.8172594010571416, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.409175992012024, "rewards/thk_ans_format_reward": 1.0, "step": 1935, "think_completion_length": 47.5625 }, { "clip_ratio": 0.0, "completion_length": 115.40625, "epoch": 3.269814502529511, "grad_norm": 10.917450013229887, "kl": 0.92578125, "learning_rate": 3.470489038785834e-07, "loss": 0.0009, "reward": 3.5084145069122314, "reward_std": 0.035716623067855835, "rewards/final_reward": 1.4282951391233816, "rewards/mask_iou_reward": 0.7141475695616908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5084145665168762, "rewards/thk_ans_format_reward": 1.0, "step": 1936, "think_completion_length": 45.625 }, { "clip_ratio": 0.0, "completion_length": 227.640625, "epoch": 3.2715008431703203, "grad_norm": 11.505594870207323, "kl": 0.431640625, "learning_rate": 3.4671163575042155e-07, "loss": 0.0004, "reward": 3.2266218662261963, "reward_std": 0.1628934144973755, "rewards/final_reward": 0.6567636062319168, "rewards/mask_iou_reward": 0.3283818031159584, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2266216576099396, "rewards/thk_ans_format_reward": 1.0, "step": 1937, "think_completion_length": 49.90625 }, { "clip_ratio": 0.0, "completion_length": 163.171875, "epoch": 3.27318718381113, "grad_norm": 7.500610170611466, "kl": 0.521484375, "learning_rate": 3.463743676222597e-07, "loss": 0.0005, "reward": 3.5439982414245605, "reward_std": 0.16045394260436296, "rewards/final_reward": 1.6367190095262139, "rewards/mask_iou_reward": 0.8183595047631069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5439982414245605, "rewards/thk_ans_format_reward": 1.0, "step": 1938, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 120.25, "epoch": 3.274873524451939, "grad_norm": 12.358596568932127, "kl": 0.603515625, "learning_rate": 3.4603709949409783e-07, "loss": 0.0006, "reward": 3.4929676055908203, "reward_std": 0.14762873388826847, "rewards/final_reward": 1.5403731262583937, "rewards/mask_iou_reward": 0.7701865631291969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4929676055908203, "rewards/thk_ans_format_reward": 1.0, "step": 1939, "think_completion_length": 51.09375 }, { "clip_ratio": 0.0, "completion_length": 120.34375, "epoch": 3.2765598650927488, "grad_norm": 7.77438392436683, "kl": 0.5205078125, "learning_rate": 3.456998313659359e-07, "loss": 0.0005, "reward": 3.4649903774261475, "reward_std": 0.054170895367860794, "rewards/final_reward": 1.2585329063507007, "rewards/mask_iou_reward": 0.6292664531753503, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4649905562400818, "rewards/thk_ans_format_reward": 1.0, "step": 1940, "think_completion_length": 51.125 }, { "clip_ratio": 0.0, "completion_length": 124.65625, "epoch": 3.2782462057335584, "grad_norm": 20.018398131695648, "kl": 0.5546875, "learning_rate": 3.45362563237774e-07, "loss": 0.0006, "reward": 3.057510256767273, "reward_std": 0.2664487063884735, "rewards/final_reward": 1.3169868227485813, "rewards/mask_iou_reward": 0.6584934113742906, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0575102269649506, "rewards/thk_ans_format_reward": 1.0, "step": 1941, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 116.96875, "epoch": 3.2799325463743676, "grad_norm": 21.138072216122055, "kl": 0.556640625, "learning_rate": 3.4502529510961214e-07, "loss": 0.0006, "reward": 3.3030476570129395, "reward_std": 0.07455268129706383, "rewards/final_reward": 1.2251721436715706, "rewards/mask_iou_reward": 0.6125860718357853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303047776222229, "rewards/thk_ans_format_reward": 1.0, "step": 1942, "think_completion_length": 48.03125 }, { "clip_ratio": 0.0, "completion_length": 119.0625, "epoch": 3.2816188870151772, "grad_norm": 13.0634941853992, "kl": 0.603515625, "learning_rate": 3.4468802698145023e-07, "loss": 0.0006, "reward": 3.625705122947693, "reward_std": 0.19386066659353673, "rewards/final_reward": 1.756648771490428, "rewards/mask_iou_reward": 0.878324385745214, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6257051825523376, "rewards/thk_ans_format_reward": 1.0, "step": 1943, "think_completion_length": 49.9375 }, { "clip_ratio": 0.0, "completion_length": 152.46875, "epoch": 3.2833052276559864, "grad_norm": 12.687196816233865, "kl": 0.4619140625, "learning_rate": 3.4435075885328837e-07, "loss": 0.0005, "reward": 3.3297626972198486, "reward_std": 0.06611794698983431, "rewards/final_reward": 1.8108261193501756, "rewards/mask_iou_reward": 0.9054130596750878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3297626972198486, "rewards/thk_ans_format_reward": 1.0, "step": 1944, "think_completion_length": 52.1875 }, { "clip_ratio": 0.0, "completion_length": 119.625, "epoch": 3.284991568296796, "grad_norm": 12.300140965553341, "kl": 0.5625, "learning_rate": 3.4401349072512646e-07, "loss": 0.0006, "reward": 3.381837844848633, "reward_std": 0.17056848295032978, "rewards/final_reward": 1.396779860879466, "rewards/mask_iou_reward": 0.698389930439733, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.381837785243988, "rewards/thk_ans_format_reward": 1.0, "step": 1945, "think_completion_length": 50.21875 }, { "clip_ratio": 0.0, "completion_length": 189.953125, "epoch": 3.2866779089376053, "grad_norm": 21.44964920998146, "kl": 0.474609375, "learning_rate": 3.4367622259696455e-07, "loss": 0.0005, "reward": 3.4118212461471558, "reward_std": 0.11445962265133858, "rewards/final_reward": 1.1539239197745816, "rewards/mask_iou_reward": 0.5769619598872908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4118210673332214, "rewards/thk_ans_format_reward": 1.0, "step": 1946, "think_completion_length": 42.0625 }, { "clip_ratio": 0.0, "completion_length": 113.8125, "epoch": 3.288364249578415, "grad_norm": 37.20110020822072, "kl": 0.51171875, "learning_rate": 3.433389544688027e-07, "loss": 0.0005, "reward": 3.058582067489624, "reward_std": 0.17460413463413715, "rewards/final_reward": 1.126570511549574, "rewards/mask_iou_reward": 0.563285255774787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0585820376873016, "rewards/thk_ans_format_reward": 1.0, "step": 1947, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 135.53125, "epoch": 3.2900505902192245, "grad_norm": 8.782980390987696, "kl": 0.779296875, "learning_rate": 3.4300168634064083e-07, "loss": 0.0008, "reward": 3.141714930534363, "reward_std": 0.13721412606537342, "rewards/final_reward": 1.3629437232870845, "rewards/mask_iou_reward": 0.6814718616435422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1417149901390076, "rewards/thk_ans_format_reward": 1.0, "step": 1948, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 127.28125, "epoch": 3.2917369308600337, "grad_norm": 10.802171398935693, "kl": 0.5390625, "learning_rate": 3.4266441821247886e-07, "loss": 0.0005, "reward": 2.82895827293396, "reward_std": 0.19724398013204336, "rewards/final_reward": 0.314011618343195, "rewards/mask_iou_reward": 0.1570058091715975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8289581835269928, "rewards/thk_ans_format_reward": 1.0, "step": 1949, "think_completion_length": 48.3125 }, { "clip_ratio": 0.0, "completion_length": 119.53125, "epoch": 3.2934232715008434, "grad_norm": 7.335381485054816, "kl": 0.64453125, "learning_rate": 3.42327150084317e-07, "loss": 0.0006, "reward": 3.3129823207855225, "reward_std": 0.05123046040534973, "rewards/final_reward": 1.3300802900736033, "rewards/mask_iou_reward": 0.6650401450368016, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3129823207855225, "rewards/thk_ans_format_reward": 1.0, "step": 1950, "think_completion_length": 51.25 }, { "clip_ratio": 0.0, "completion_length": 115.28125, "epoch": 3.2951096121416525, "grad_norm": 9.971884177129356, "kl": 0.576171875, "learning_rate": 3.4198988195615514e-07, "loss": 0.0006, "reward": 3.6915000677108765, "reward_std": 0.07537084259092808, "rewards/final_reward": 1.7635042987473266, "rewards/mask_iou_reward": 0.8817521493736633, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6915001273155212, "rewards/thk_ans_format_reward": 1.0, "step": 1951, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 114.453125, "epoch": 3.296795952782462, "grad_norm": 13.312910016366194, "kl": 0.548828125, "learning_rate": 3.416526138279933e-07, "loss": 0.0006, "reward": 3.1335134506225586, "reward_std": 0.1029847264289856, "rewards/final_reward": 0.9387565138742173, "rewards/mask_iou_reward": 0.46937825693710866, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1335134506225586, "rewards/thk_ans_format_reward": 1.0, "step": 1952, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 130.640625, "epoch": 3.2984822934232714, "grad_norm": 5.730294323235976, "kl": 0.541015625, "learning_rate": 3.413153456998313e-07, "loss": 0.0006, "reward": 3.307882785797119, "reward_std": 0.08197947776352521, "rewards/final_reward": 0.9149686594569242, "rewards/mask_iou_reward": 0.4574843297284621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3078828155994415, "rewards/thk_ans_format_reward": 1.0, "step": 1953, "think_completion_length": 52.4375 }, { "clip_ratio": 0.0, "completion_length": 181.03125, "epoch": 3.300168634064081, "grad_norm": 4.118069432736326, "kl": 0.52734375, "learning_rate": 3.4097807757166946e-07, "loss": 0.0005, "reward": 3.575540781021118, "reward_std": 0.09125766530632973, "rewards/final_reward": 1.7892343078399855, "rewards/mask_iou_reward": 0.8946171539199927, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5755407810211182, "rewards/thk_ans_format_reward": 1.0, "step": 1954, "think_completion_length": 41.40625 }, { "clip_ratio": 0.0, "completion_length": 119.921875, "epoch": 3.30185497470489, "grad_norm": 5.844672810729986, "kl": 0.63671875, "learning_rate": 3.406408094435076e-07, "loss": 0.0006, "reward": 3.340492367744446, "reward_std": 0.09307361952960491, "rewards/final_reward": 1.3782593471998423, "rewards/mask_iou_reward": 0.6891296735999212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3404923677444458, "rewards/thk_ans_format_reward": 1.0, "step": 1955, "think_completion_length": 45.625 }, { "clip_ratio": 0.0, "completion_length": 114.640625, "epoch": 3.3035413153457, "grad_norm": 8.724432617546066, "kl": 0.65234375, "learning_rate": 3.403035413153457e-07, "loss": 0.0006, "reward": 3.5463435649871826, "reward_std": 0.022105058655142784, "rewards/final_reward": 1.7385080818964855, "rewards/mask_iou_reward": 0.8692540409482428, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5463436841964722, "rewards/thk_ans_format_reward": 1.0, "step": 1956, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 155.671875, "epoch": 3.305227655986509, "grad_norm": 5.245602464263963, "kl": 0.4326171875, "learning_rate": 3.3996627318718377e-07, "loss": 0.0004, "reward": 3.68087375164032, "reward_std": 0.20528633147478104, "rewards/final_reward": 1.6173913805101567, "rewards/mask_iou_reward": 0.8086956902550784, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6808737516403198, "rewards/thk_ans_format_reward": 1.0, "step": 1957, "think_completion_length": 42.375 }, { "clip_ratio": 0.0, "completion_length": 143.15625, "epoch": 3.3069139966273187, "grad_norm": 7.913915419345036, "kl": 0.509765625, "learning_rate": 3.396290050590219e-07, "loss": 0.0005, "reward": 3.4271721839904785, "reward_std": 0.15432360395789146, "rewards/final_reward": 1.7015906586002267, "rewards/mask_iou_reward": 0.8507953293001134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4271721243858337, "rewards/thk_ans_format_reward": 1.0, "step": 1958, "think_completion_length": 47.96875 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 3.3086003372681283, "grad_norm": 16.336278572012315, "kl": 0.58984375, "learning_rate": 3.3929173693086e-07, "loss": 0.0006, "reward": 3.6032882928848267, "reward_std": 0.10025950521230698, "rewards/final_reward": 1.5689837273879075, "rewards/mask_iou_reward": 0.7844918636939537, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6032883524894714, "rewards/thk_ans_format_reward": 1.0, "step": 1959, "think_completion_length": 44.40625 }, { "clip_ratio": 0.0, "completion_length": 131.25, "epoch": 3.3102866779089375, "grad_norm": 11.476881090573237, "kl": 0.52734375, "learning_rate": 3.3895446880269814e-07, "loss": 0.0005, "reward": 3.1221102476119995, "reward_std": 0.5458821058273315, "rewards/final_reward": 1.2438918568758344, "rewards/mask_iou_reward": 0.6219459284379172, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1221102476119995, "rewards/thk_ans_format_reward": 1.0, "step": 1960, "think_completion_length": 49.3125 }, { "clip_ratio": 0.0, "completion_length": 113.078125, "epoch": 3.311973018549747, "grad_norm": 7.1417455994906796, "kl": 0.53515625, "learning_rate": 3.3861720067453623e-07, "loss": 0.0005, "reward": 3.7143653631210327, "reward_std": 0.1868691765703261, "rewards/final_reward": 1.569197623422851, "rewards/mask_iou_reward": 0.7845988117114255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7143654227256775, "rewards/thk_ans_format_reward": 1.0, "step": 1961, "think_completion_length": 44.46875 }, { "clip_ratio": 0.0, "completion_length": 116.625, "epoch": 3.3136593591905563, "grad_norm": 11.18135115326309, "kl": 0.556640625, "learning_rate": 3.382799325463743e-07, "loss": 0.0006, "reward": 3.6555434465408325, "reward_std": 0.026283076032996178, "rewards/final_reward": 1.713166532577928, "rewards/mask_iou_reward": 0.856583266288964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.655543327331543, "rewards/thk_ans_format_reward": 1.0, "step": 1962, "think_completion_length": 47.28125 }, { "clip_ratio": 0.0, "completion_length": 117.15625, "epoch": 3.315345699831366, "grad_norm": 5.682834234008431, "kl": 0.59765625, "learning_rate": 3.3794266441821246e-07, "loss": 0.0005, "reward": 3.7232742309570312, "reward_std": 0.004197546397335827, "rewards/final_reward": 1.612368796690126, "rewards/mask_iou_reward": 0.806184398345063, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7232744693756104, "rewards/thk_ans_format_reward": 1.0, "step": 1963, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 113.703125, "epoch": 3.317032040472175, "grad_norm": 17.08310617062568, "kl": 0.5625, "learning_rate": 3.376053962900506e-07, "loss": 0.0007, "reward": 3.6794604063034058, "reward_std": 0.12550297752022743, "rewards/final_reward": 1.52696537807861, "rewards/mask_iou_reward": 0.763482689039305, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6794604063034058, "rewards/thk_ans_format_reward": 1.0, "step": 1964, "think_completion_length": 46.15625 }, { "clip_ratio": 0.0, "completion_length": 116.078125, "epoch": 3.318718381112985, "grad_norm": 6.031217553059817, "kl": 0.5703125, "learning_rate": 3.3726812816188874e-07, "loss": 0.0006, "reward": 3.697144627571106, "reward_std": 0.05235449317842722, "rewards/final_reward": 1.5463581091045677, "rewards/mask_iou_reward": 0.7731790545522839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.697144627571106, "rewards/thk_ans_format_reward": 1.0, "step": 1965, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 118.8125, "epoch": 3.3204047217537944, "grad_norm": 9.929998031126646, "kl": 0.580078125, "learning_rate": 3.3693086003372677e-07, "loss": 0.0006, "reward": 2.651795506477356, "reward_std": 0.13870839029550552, "rewards/final_reward": 0.679500139659625, "rewards/mask_iou_reward": 0.3397500698298125, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6517956554889679, "rewards/thk_ans_format_reward": 1.0, "step": 1966, "think_completion_length": 41.3125 }, { "clip_ratio": 0.0, "completion_length": 117.59375, "epoch": 3.3220910623946036, "grad_norm": 5.908865902855194, "kl": 0.51171875, "learning_rate": 3.365935919055649e-07, "loss": 0.0005, "reward": 3.6012171506881714, "reward_std": 0.03993457509204745, "rewards/final_reward": 1.517730822907334, "rewards/mask_iou_reward": 0.758865411453667, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6012172102928162, "rewards/thk_ans_format_reward": 1.0, "step": 1967, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 143.09375, "epoch": 3.3237774030354132, "grad_norm": 6.590923867848572, "kl": 0.576171875, "learning_rate": 3.3625632377740305e-07, "loss": 0.0006, "reward": 3.4035218954086304, "reward_std": 0.15390025824308395, "rewards/final_reward": 1.7916255876053522, "rewards/mask_iou_reward": 0.8958127938026761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4035218954086304, "rewards/thk_ans_format_reward": 1.0, "step": 1968, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 151.53125, "epoch": 3.3254637436762224, "grad_norm": 13.828438711649138, "kl": 0.478515625, "learning_rate": 3.3591905564924114e-07, "loss": 0.0005, "reward": 3.4149030447006226, "reward_std": 0.08157273754477501, "rewards/final_reward": 1.1635934078063273, "rewards/mask_iou_reward": 0.5817967039031636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4149029850959778, "rewards/thk_ans_format_reward": 1.0, "step": 1969, "think_completion_length": 47.625 }, { "clip_ratio": 0.0, "completion_length": 113.84375, "epoch": 3.327150084317032, "grad_norm": 10.066970850691444, "kl": 0.595703125, "learning_rate": 3.355817875210792e-07, "loss": 0.0008, "reward": 3.6865395307540894, "reward_std": 0.07062768749892712, "rewards/final_reward": 1.8212618523933068, "rewards/mask_iou_reward": 0.9106309261966534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6865394711494446, "rewards/thk_ans_format_reward": 1.0, "step": 1970, "think_completion_length": 47.71875 }, { "clip_ratio": 0.0, "completion_length": 183.59375, "epoch": 3.3288364249578413, "grad_norm": 8.74797620219619, "kl": 0.49609375, "learning_rate": 3.3524451939291737e-07, "loss": 0.0005, "reward": 3.249285936355591, "reward_std": 0.08715942595154047, "rewards/final_reward": 1.2155885135791953, "rewards/mask_iou_reward": 0.6077942567895976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2492859959602356, "rewards/thk_ans_format_reward": 1.0, "step": 1971, "think_completion_length": 42.0 }, { "clip_ratio": 0.0, "completion_length": 146.671875, "epoch": 3.330522765598651, "grad_norm": 20.092660644475593, "kl": 0.609375, "learning_rate": 3.3490725126475545e-07, "loss": 0.0006, "reward": 2.717191219329834, "reward_std": 0.2924363315105438, "rewards/final_reward": 0.9926286151476688, "rewards/mask_iou_reward": 0.4963143075738344, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.7796911597251892, "rewards/thk_ans_format_reward": 0.96875, "step": 1972, "think_completion_length": 48.625 }, { "clip_ratio": 0.0, "completion_length": 116.03125, "epoch": 3.3322091062394605, "grad_norm": 18.654057852768332, "kl": 0.576171875, "learning_rate": 3.345699831365936e-07, "loss": 0.0006, "reward": 3.6741316318511963, "reward_std": 0.0648178979754448, "rewards/final_reward": 1.4977320756789554, "rewards/mask_iou_reward": 0.7488660378394777, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.674131691455841, "rewards/thk_ans_format_reward": 1.0, "step": 1973, "think_completion_length": 47.21875 }, { "clip_ratio": 0.0, "completion_length": 117.625, "epoch": 3.3338954468802697, "grad_norm": 6.74948481793854, "kl": 0.55859375, "learning_rate": 3.342327150084317e-07, "loss": 0.0006, "reward": 3.2847015857696533, "reward_std": 0.057460593059659004, "rewards/final_reward": 0.9617596873774227, "rewards/mask_iou_reward": 0.48087984368871134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.284701406955719, "rewards/thk_ans_format_reward": 1.0, "step": 1974, "think_completion_length": 51.1875 }, { "clip_ratio": 0.0, "completion_length": 185.78125, "epoch": 3.3355817875210794, "grad_norm": 25.024892296128783, "kl": 0.478515625, "learning_rate": 3.338954468802698e-07, "loss": 0.0005, "reward": 3.3408730030059814, "reward_std": 0.061326127499341965, "rewards/final_reward": 1.5239473580254677, "rewards/mask_iou_reward": 0.7619736790127338, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.340872883796692, "rewards/thk_ans_format_reward": 1.0, "step": 1975, "think_completion_length": 48.59375 }, { "clip_ratio": 0.0, "completion_length": 119.109375, "epoch": 3.3372681281618886, "grad_norm": 12.554408530560169, "kl": 0.5546875, "learning_rate": 3.335581787521079e-07, "loss": 0.0005, "reward": 2.9226410388946533, "reward_std": 0.2110334150493145, "rewards/final_reward": 0.45685307238001177, "rewards/mask_iou_reward": 0.22842653619000589, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9226409792900085, "rewards/thk_ans_format_reward": 1.0, "step": 1976, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 148.71875, "epoch": 3.338954468802698, "grad_norm": 9.750316649812305, "kl": 0.4951171875, "learning_rate": 3.3322091062394605e-07, "loss": 0.0005, "reward": 3.3409875631332397, "reward_std": 0.10255017504096031, "rewards/final_reward": 0.9411056049973254, "rewards/mask_iou_reward": 0.4705528024986627, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3409876227378845, "rewards/thk_ans_format_reward": 1.0, "step": 1977, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 118.1875, "epoch": 3.3406408094435074, "grad_norm": 6.12924664923464, "kl": 0.599609375, "learning_rate": 3.3288364249578414e-07, "loss": 0.0006, "reward": 3.2812459468841553, "reward_std": 0.033132096752524376, "rewards/final_reward": 0.9117269831595742, "rewards/mask_iou_reward": 0.4558634915797871, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2812458276748657, "rewards/thk_ans_format_reward": 1.0, "step": 1978, "think_completion_length": 49.59375 }, { "clip_ratio": 0.0, "completion_length": 118.40625, "epoch": 3.342327150084317, "grad_norm": 12.976404555979629, "kl": 0.5234375, "learning_rate": 3.325463743676222e-07, "loss": 0.0005, "reward": 3.1836129426956177, "reward_std": 0.0405933503061533, "rewards/final_reward": 0.7397104617970125, "rewards/mask_iou_reward": 0.36985523089850625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.183612883090973, "rewards/thk_ans_format_reward": 1.0, "step": 1979, "think_completion_length": 49.1875 }, { "clip_ratio": 0.0, "completion_length": 115.53125, "epoch": 3.3440134907251267, "grad_norm": 6.388749541296147, "kl": 0.578125, "learning_rate": 3.3220910623946036e-07, "loss": 0.0006, "reward": 3.7560627460479736, "reward_std": 0.01202178793027997, "rewards/final_reward": 1.8708323976728498, "rewards/mask_iou_reward": 0.9354161988364249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.756062626838684, "rewards/thk_ans_format_reward": 1.0, "step": 1980, "think_completion_length": 48.9375 }, { "clip_ratio": 0.0, "completion_length": 115.59375, "epoch": 3.345699831365936, "grad_norm": 45.06176010876029, "kl": 0.6015625, "learning_rate": 3.318718381112985e-07, "loss": 0.0006, "reward": 3.0571417808532715, "reward_std": 0.10221374221146107, "rewards/final_reward": 1.0803357033982843, "rewards/mask_iou_reward": 0.5401678516991422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0571418702602386, "rewards/thk_ans_format_reward": 1.0, "step": 1981, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 173.453125, "epoch": 3.3473861720067455, "grad_norm": 7.820569038981191, "kl": 0.591796875, "learning_rate": 3.3153456998313654e-07, "loss": 0.0006, "reward": 3.6929433345794678, "reward_std": 0.1502692373469472, "rewards/final_reward": 1.646944216830329, "rewards/mask_iou_reward": 0.8234721084151645, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6929433345794678, "rewards/thk_ans_format_reward": 1.0, "step": 1982, "think_completion_length": 52.53125 }, { "clip_ratio": 0.0, "completion_length": 116.3125, "epoch": 3.3490725126475547, "grad_norm": 7.5418838910115396, "kl": 0.61328125, "learning_rate": 3.311973018549747e-07, "loss": 0.0006, "reward": 3.5247732400894165, "reward_std": 0.04335535317659378, "rewards/final_reward": 1.846402055210505, "rewards/mask_iou_reward": 0.9232010276052525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5247732400894165, "rewards/thk_ans_format_reward": 1.0, "step": 1983, "think_completion_length": 45.5 }, { "clip_ratio": 0.0, "completion_length": 219.46875, "epoch": 3.3507588532883643, "grad_norm": 6.148037786110945, "kl": 0.443359375, "learning_rate": 3.308600337268128e-07, "loss": 0.0004, "reward": 3.2120453119277954, "reward_std": 0.1635238453745842, "rewards/final_reward": 1.1920128923702205, "rewards/mask_iou_reward": 0.5960064461851102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2120453119277954, "rewards/thk_ans_format_reward": 1.0, "step": 1984, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 132.171875, "epoch": 3.3524451939291735, "grad_norm": 6.184763419654266, "kl": 0.568359375, "learning_rate": 3.305227655986509e-07, "loss": 0.0006, "reward": 3.331853151321411, "reward_std": 0.043349689804017544, "rewards/final_reward": 1.3908189338518653, "rewards/mask_iou_reward": 0.6954094669259326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3318531513214111, "rewards/thk_ans_format_reward": 1.0, "step": 1985, "think_completion_length": 45.09375 }, { "clip_ratio": 0.0, "completion_length": 205.734375, "epoch": 3.354131534569983, "grad_norm": 8.375346934472132, "kl": 0.455078125, "learning_rate": 3.30185497470489e-07, "loss": 0.0005, "reward": 3.3176496028900146, "reward_std": 0.08942844346165657, "rewards/final_reward": 1.2392748999825676, "rewards/mask_iou_reward": 0.6196374499912838, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.317649632692337, "rewards/thk_ans_format_reward": 1.0, "step": 1986, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 229.984375, "epoch": 3.3558178752107928, "grad_norm": 18.287534548610328, "kl": 0.578125, "learning_rate": 3.2984822934232713e-07, "loss": 0.0005, "reward": 3.329722285270691, "reward_std": 0.27538200467824936, "rewards/final_reward": 0.9544553569256847, "rewards/mask_iou_reward": 0.47722767846284236, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3609723448753357, "rewards/thk_ans_format_reward": 0.984375, "step": 1987, "think_completion_length": 48.53125 }, { "clip_ratio": 0.0, "completion_length": 112.875, "epoch": 3.357504215851602, "grad_norm": 7.456631450698301, "kl": 0.560546875, "learning_rate": 3.295109612141653e-07, "loss": 0.0006, "reward": 3.1098419427871704, "reward_std": 0.17263797670602798, "rewards/final_reward": 1.04505706196219, "rewards/mask_iou_reward": 0.522528530981095, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1098419725894928, "rewards/thk_ans_format_reward": 1.0, "step": 1988, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 114.546875, "epoch": 3.3591905564924116, "grad_norm": 5.7076069066498025, "kl": 0.5390625, "learning_rate": 3.2917369308600336e-07, "loss": 0.0005, "reward": 3.655590057373047, "reward_std": 0.2154662348330021, "rewards/final_reward": 1.5834517482320138, "rewards/mask_iou_reward": 0.7917258741160069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6555900573730469, "rewards/thk_ans_format_reward": 1.0, "step": 1989, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 139.515625, "epoch": 3.360876897133221, "grad_norm": 7.505812510113379, "kl": 0.548828125, "learning_rate": 3.2883642495784145e-07, "loss": 0.0005, "reward": 3.279410719871521, "reward_std": 0.1372041329741478, "rewards/final_reward": 1.574243533042608, "rewards/mask_iou_reward": 0.787121766521304, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2794106602668762, "rewards/thk_ans_format_reward": 1.0, "step": 1990, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 115.765625, "epoch": 3.3625632377740304, "grad_norm": 9.130921737801906, "kl": 0.5546875, "learning_rate": 3.284991568296796e-07, "loss": 0.0006, "reward": 3.6248198747634888, "reward_std": 0.29395322501659393, "rewards/final_reward": 1.6349420585259278, "rewards/mask_iou_reward": 0.8174710292629639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6248198747634888, "rewards/thk_ans_format_reward": 1.0, "step": 1991, "think_completion_length": 52.34375 }, { "clip_ratio": 0.0, "completion_length": 148.640625, "epoch": 3.3642495784148396, "grad_norm": 19.766241043754633, "kl": 0.484375, "learning_rate": 3.281618887015177e-07, "loss": 0.0005, "reward": 3.090684175491333, "reward_std": 0.11629136651754379, "rewards/final_reward": 0.7971860584649041, "rewards/mask_iou_reward": 0.39859302923245205, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0906842648983002, "rewards/thk_ans_format_reward": 1.0, "step": 1992, "think_completion_length": 44.375 }, { "clip_ratio": 0.0, "completion_length": 144.390625, "epoch": 3.3659359190556493, "grad_norm": 84.45935300190781, "kl": 0.5625, "learning_rate": 3.278246205733558e-07, "loss": 0.0006, "reward": 3.4176278114318848, "reward_std": 0.12633688002824783, "rewards/final_reward": 1.6584649862155265, "rewards/mask_iou_reward": 0.8292324931077633, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.41762775182724, "rewards/thk_ans_format_reward": 1.0, "step": 1993, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 114.90625, "epoch": 3.367622259696459, "grad_norm": 8.5894189382737, "kl": 0.568359375, "learning_rate": 3.274873524451939e-07, "loss": 0.0006, "reward": 3.543907880783081, "reward_std": 0.0963448672555387, "rewards/final_reward": 1.799001571553526, "rewards/mask_iou_reward": 0.899500785776763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5439078211784363, "rewards/thk_ans_format_reward": 1.0, "step": 1994, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 116.1875, "epoch": 3.369308600337268, "grad_norm": 5.872835614458384, "kl": 0.62890625, "learning_rate": 3.27150084317032e-07, "loss": 0.0007, "reward": 3.2476214170455933, "reward_std": 0.09465612005442381, "rewards/final_reward": 1.18733627472457, "rewards/mask_iou_reward": 0.593668137362285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2476215362548828, "rewards/thk_ans_format_reward": 1.0, "step": 1995, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 123.21875, "epoch": 3.3709949409780777, "grad_norm": 12.968097799327696, "kl": 0.685546875, "learning_rate": 3.2681281618887013e-07, "loss": 0.0007, "reward": 3.2468059062957764, "reward_std": 0.14746061153709888, "rewards/final_reward": 1.023684623301193, "rewards/mask_iou_reward": 0.5118423116505965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2468059062957764, "rewards/thk_ans_format_reward": 1.0, "step": 1996, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 155.71875, "epoch": 3.372681281618887, "grad_norm": 14.993651185819589, "kl": 0.615234375, "learning_rate": 3.264755480607083e-07, "loss": 0.0006, "reward": 2.8943779468536377, "reward_std": 0.1070544458925724, "rewards/final_reward": 1.6384638413116381, "rewards/mask_iou_reward": 0.8192319206558191, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8943780958652496, "rewards/thk_ans_format_reward": 1.0, "step": 1997, "think_completion_length": 47.5 }, { "clip_ratio": 0.0, "completion_length": 174.09375, "epoch": 3.3743676222596966, "grad_norm": 4.5367073345316555, "kl": 0.5166015625, "learning_rate": 3.261382799325463e-07, "loss": 0.0005, "reward": 3.6099237203598022, "reward_std": 0.07347086956724524, "rewards/final_reward": 1.8372419839056675, "rewards/mask_iou_reward": 0.9186209919528338, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6099236011505127, "rewards/thk_ans_format_reward": 1.0, "step": 1998, "think_completion_length": 35.96875 }, { "clip_ratio": 0.0, "completion_length": 110.015625, "epoch": 3.3760539629005057, "grad_norm": 7.102930347949191, "kl": 0.615234375, "learning_rate": 3.2580101180438445e-07, "loss": 0.0006, "reward": 3.5327283143997192, "reward_std": 0.14749955013394356, "rewards/final_reward": 1.8147781307339632, "rewards/mask_iou_reward": 0.9073890653669816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5327282547950745, "rewards/thk_ans_format_reward": 1.0, "step": 1999, "think_completion_length": 39.9375 }, { "clip_ratio": 0.0, "completion_length": 114.171875, "epoch": 3.3777403035413154, "grad_norm": 9.942553625687975, "kl": 0.537109375, "learning_rate": 3.254637436762226e-07, "loss": 0.0005, "reward": 3.547389268875122, "reward_std": 0.1251727119088173, "rewards/final_reward": 1.511609798042504, "rewards/mask_iou_reward": 0.755804899021252, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5473893284797668, "rewards/thk_ans_format_reward": 1.0, "step": 2000, "think_completion_length": 42.0 }, { "clip_ratio": 0.0, "completion_length": 112.984375, "epoch": 3.379426644182125, "grad_norm": 24.911534805683672, "kl": 1.146484375, "learning_rate": 3.2512647554806073e-07, "loss": 0.0011, "reward": 3.4626386165618896, "reward_std": 0.06342816725373268, "rewards/final_reward": 1.3687977350358613, "rewards/mask_iou_reward": 0.6843988675179307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4626386761665344, "rewards/thk_ans_format_reward": 1.0, "step": 2001, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 135.46875, "epoch": 3.381112984822934, "grad_norm": 63.71804719258427, "kl": 0.5546875, "learning_rate": 3.247892074198988e-07, "loss": 0.0006, "reward": 3.0394468307495117, "reward_std": 0.27142253518104553, "rewards/final_reward": 1.1951839934034023, "rewards/mask_iou_reward": 0.5975919967017012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0394466519355774, "rewards/thk_ans_format_reward": 1.0, "step": 2002, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 113.609375, "epoch": 3.382799325463744, "grad_norm": 10.90467501879512, "kl": 0.57421875, "learning_rate": 3.244519392917369e-07, "loss": 0.0006, "reward": 3.1326643228530884, "reward_std": 0.041215680539608, "rewards/final_reward": 0.9408250897246508, "rewards/mask_iou_reward": 0.4704125448623254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1326642036437988, "rewards/thk_ans_format_reward": 1.0, "step": 2003, "think_completion_length": 44.75 }, { "clip_ratio": 0.0, "completion_length": 125.6875, "epoch": 3.384485666104553, "grad_norm": 10.49759767307761, "kl": 0.568359375, "learning_rate": 3.2411467116357504e-07, "loss": 0.0006, "reward": 3.1833359003067017, "reward_std": 0.08077399618923664, "rewards/final_reward": 1.5277296883693114, "rewards/mask_iou_reward": 0.7638648441846557, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1833359003067017, "rewards/thk_ans_format_reward": 1.0, "step": 2004, "think_completion_length": 46.46875 }, { "clip_ratio": 0.0, "completion_length": 115.3125, "epoch": 3.3861720067453627, "grad_norm": 69.54637223102522, "kl": 0.6015625, "learning_rate": 3.2377740303541313e-07, "loss": 0.0006, "reward": 3.186821937561035, "reward_std": 0.07137523218989372, "rewards/final_reward": 1.4912311188442389, "rewards/mask_iou_reward": 0.7456155594221194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.186821848154068, "rewards/thk_ans_format_reward": 1.0, "step": 2005, "think_completion_length": 49.90625 }, { "clip_ratio": 0.0, "completion_length": 117.375, "epoch": 3.387858347386172, "grad_norm": 9.035577647308175, "kl": 0.80078125, "learning_rate": 3.2344013490725127e-07, "loss": 0.0008, "reward": 3.420803427696228, "reward_std": 0.18055840581655502, "rewards/final_reward": 1.6420055230666772, "rewards/mask_iou_reward": 0.8210027615333386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4208033084869385, "rewards/thk_ans_format_reward": 1.0, "step": 2006, "think_completion_length": 51.59375 }, { "clip_ratio": 0.0, "completion_length": 110.5, "epoch": 3.3895446880269815, "grad_norm": 8.342453926202667, "kl": 0.736328125, "learning_rate": 3.2310286677908936e-07, "loss": 0.0007, "reward": 3.167693018913269, "reward_std": 0.07003648579120636, "rewards/final_reward": 1.6847967851713777, "rewards/mask_iou_reward": 0.8423983925856888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1676931977272034, "rewards/thk_ans_format_reward": 1.0, "step": 2007, "think_completion_length": 41.0 }, { "clip_ratio": 0.0, "completion_length": 111.8125, "epoch": 3.391231028667791, "grad_norm": 7.629456776734595, "kl": 0.6328125, "learning_rate": 3.2276559865092745e-07, "loss": 0.0006, "reward": 3.3508166074752808, "reward_std": 0.12238148972392082, "rewards/final_reward": 1.0337033341593962, "rewards/mask_iou_reward": 0.5168516670796981, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.350816547870636, "rewards/thk_ans_format_reward": 1.0, "step": 2008, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 122.34375, "epoch": 3.3929173693086003, "grad_norm": 7.234444379509983, "kl": 0.66796875, "learning_rate": 3.224283305227656e-07, "loss": 0.0007, "reward": 3.4166321754455566, "reward_std": 0.23674443364143372, "rewards/final_reward": 1.1408130644550725, "rewards/mask_iou_reward": 0.5704065322275362, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4166321754455566, "rewards/thk_ans_format_reward": 1.0, "step": 2009, "think_completion_length": 44.65625 }, { "clip_ratio": 0.0, "completion_length": 118.984375, "epoch": 3.39460370994941, "grad_norm": 7.506944229344649, "kl": 0.58203125, "learning_rate": 3.2209106239460373e-07, "loss": 0.0006, "reward": 3.1270315647125244, "reward_std": 0.11961232125759125, "rewards/final_reward": 0.9250940028783735, "rewards/mask_iou_reward": 0.46254700143918676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1270316243171692, "rewards/thk_ans_format_reward": 1.0, "step": 2010, "think_completion_length": 50.4375 }, { "clip_ratio": 0.0, "completion_length": 122.09375, "epoch": 3.396290050590219, "grad_norm": 5.081626548226494, "kl": 0.5185546875, "learning_rate": 3.2175379426644176e-07, "loss": 0.0005, "reward": 3.4909207820892334, "reward_std": 0.12029211595654488, "rewards/final_reward": 1.5802509611939328, "rewards/mask_iou_reward": 0.7901254805969664, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4909207820892334, "rewards/thk_ans_format_reward": 1.0, "step": 2011, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 113.03125, "epoch": 3.397976391231029, "grad_norm": 7.583470125843207, "kl": 0.5419921875, "learning_rate": 3.214165261382799e-07, "loss": 0.0005, "reward": 3.3409262895584106, "reward_std": 0.14934771042317152, "rewards/final_reward": 1.3885411302330193, "rewards/mask_iou_reward": 0.6942705651165096, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3409262895584106, "rewards/thk_ans_format_reward": 1.0, "step": 2012, "think_completion_length": 42.75 }, { "clip_ratio": 0.0, "completion_length": 134.9375, "epoch": 3.399662731871838, "grad_norm": 8.621093358210288, "kl": 0.537109375, "learning_rate": 3.2107925801011804e-07, "loss": 0.0005, "reward": 3.067779302597046, "reward_std": 0.27544330805540085, "rewards/final_reward": 0.8974693611197712, "rewards/mask_iou_reward": 0.4487346805598856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0677792429924011, "rewards/thk_ans_format_reward": 1.0, "step": 2013, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 154.9375, "epoch": 3.4013490725126476, "grad_norm": 10.964660941089583, "kl": 0.6328125, "learning_rate": 3.207419898819562e-07, "loss": 0.0006, "reward": 3.358281373977661, "reward_std": 0.06170746497809887, "rewards/final_reward": 0.8549180381163555, "rewards/mask_iou_reward": 0.42745901905817774, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3582815527915955, "rewards/thk_ans_format_reward": 1.0, "step": 2014, "think_completion_length": 41.5625 }, { "clip_ratio": 0.0, "completion_length": 170.8125, "epoch": 3.403035413153457, "grad_norm": 6.593952928176873, "kl": 0.5068359375, "learning_rate": 3.204047217537942e-07, "loss": 0.0005, "reward": 3.2059460878372192, "reward_std": 0.1293669156730175, "rewards/final_reward": 1.1842376119410463, "rewards/mask_iou_reward": 0.5921188059705231, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2059460878372192, "rewards/thk_ans_format_reward": 1.0, "step": 2015, "think_completion_length": 44.78125 }, { "clip_ratio": 0.0, "completion_length": 110.859375, "epoch": 3.4047217537942664, "grad_norm": 16.12581705453544, "kl": 0.60546875, "learning_rate": 3.2006745362563236e-07, "loss": 0.0006, "reward": 3.2048619985580444, "reward_std": 0.17501818388700485, "rewards/final_reward": 1.3144665586082775, "rewards/mask_iou_reward": 0.6572332793041388, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2048619985580444, "rewards/thk_ans_format_reward": 1.0, "step": 2016, "think_completion_length": 41.21875 }, { "clip_ratio": 0.0, "completion_length": 115.265625, "epoch": 3.4064080944350756, "grad_norm": 5.37399183205758, "kl": 0.552734375, "learning_rate": 3.197301854974705e-07, "loss": 0.0006, "reward": 3.5387697219848633, "reward_std": 0.011349121574312449, "rewards/final_reward": 1.845533327452562, "rewards/mask_iou_reward": 0.922766663726281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.538769781589508, "rewards/thk_ans_format_reward": 1.0, "step": 2017, "think_completion_length": 43.90625 }, { "clip_ratio": 0.0, "completion_length": 214.296875, "epoch": 3.4080944350758853, "grad_norm": 25.972066541966154, "kl": 0.4873046875, "learning_rate": 3.193929173693086e-07, "loss": 0.0005, "reward": 3.243216633796692, "reward_std": 0.43901199474930763, "rewards/final_reward": 1.404129992277193, "rewards/mask_iou_reward": 0.7020649961385965, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.305716633796692, "rewards/thk_ans_format_reward": 0.96875, "step": 2018, "think_completion_length": 39.59375 }, { "clip_ratio": 0.0, "completion_length": 114.609375, "epoch": 3.409780775716695, "grad_norm": 12.503372878276222, "kl": 0.546875, "learning_rate": 3.1905564924114667e-07, "loss": 0.0005, "reward": 3.233471155166626, "reward_std": 0.021571812219917774, "rewards/final_reward": 0.977786239537827, "rewards/mask_iou_reward": 0.4888931197689135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.233471155166626, "rewards/thk_ans_format_reward": 1.0, "step": 2019, "think_completion_length": 49.4375 }, { "clip_ratio": 0.0, "completion_length": 157.125, "epoch": 3.411467116357504, "grad_norm": 9.279611653873177, "kl": 0.552734375, "learning_rate": 3.187183811129848e-07, "loss": 0.0006, "reward": 3.442598581314087, "reward_std": 0.22078751027584076, "rewards/final_reward": 1.3976067600815076, "rewards/mask_iou_reward": 0.6988033800407538, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4425984621047974, "rewards/thk_ans_format_reward": 1.0, "step": 2020, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 185.765625, "epoch": 3.4131534569983137, "grad_norm": 5.152016285443864, "kl": 0.4921875, "learning_rate": 3.183811129848229e-07, "loss": 0.0005, "reward": 3.3022444248199463, "reward_std": 0.07596256211400032, "rewards/final_reward": 1.1442824262621978, "rewards/mask_iou_reward": 0.5721412131310989, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3022443056106567, "rewards/thk_ans_format_reward": 1.0, "step": 2021, "think_completion_length": 45.71875 }, { "clip_ratio": 0.0, "completion_length": 115.1875, "epoch": 3.414839797639123, "grad_norm": 71.59325436651602, "kl": 0.591796875, "learning_rate": 3.1804384485666104e-07, "loss": 0.0006, "reward": 2.967165946960449, "reward_std": 0.06337304785847664, "rewards/final_reward": 1.3569264816968474, "rewards/mask_iou_reward": 0.6784632408484237, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9671659171581268, "rewards/thk_ans_format_reward": 1.0, "step": 2022, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 111.203125, "epoch": 3.4165261382799326, "grad_norm": 5.560328479002505, "kl": 0.619140625, "learning_rate": 3.1770657672849913e-07, "loss": 0.0006, "reward": 2.8386834859848022, "reward_std": 0.013690002728253603, "rewards/final_reward": 0.0, "rewards/mask_iou_reward": 0.0, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.838683545589447, "rewards/thk_ans_format_reward": 1.0, "step": 2023, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 143.109375, "epoch": 3.4182124789207418, "grad_norm": 10.56040792391872, "kl": 0.6484375, "learning_rate": 3.173693086003372e-07, "loss": 0.0006, "reward": 2.722140312194824, "reward_std": 0.06009296700358391, "rewards/final_reward": 1.0703529767006847, "rewards/mask_iou_reward": 0.5351764883503424, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7221404016017914, "rewards/thk_ans_format_reward": 1.0, "step": 2024, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 183.03125, "epoch": 3.4198988195615514, "grad_norm": 19.433490902830165, "kl": 0.8984375, "learning_rate": 3.1703204047217536e-07, "loss": 0.0009, "reward": 3.1550711393356323, "reward_std": 0.5463046324439347, "rewards/final_reward": 1.3991503312967488, "rewards/mask_iou_reward": 0.6995751656483744, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.280071198940277, "rewards/thk_ans_format_reward": 0.9375, "step": 2025, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 113.765625, "epoch": 3.421585160202361, "grad_norm": 17.87867964050926, "kl": 0.6328125, "learning_rate": 3.166947723440135e-07, "loss": 0.0006, "reward": 3.1133947372436523, "reward_std": 0.109176866710186, "rewards/final_reward": 1.1625815169505365, "rewards/mask_iou_reward": 0.5812907584752682, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1133947968482971, "rewards/thk_ans_format_reward": 1.0, "step": 2026, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 115.0, "epoch": 3.4232715008431702, "grad_norm": 10.094195931918332, "kl": 0.55859375, "learning_rate": 3.1635750421585164e-07, "loss": 0.0006, "reward": 3.1443214416503906, "reward_std": 0.06616199389100075, "rewards/final_reward": 0.677497794267151, "rewards/mask_iou_reward": 0.3387488971335755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1443215012550354, "rewards/thk_ans_format_reward": 1.0, "step": 2027, "think_completion_length": 42.59375 }, { "clip_ratio": 0.0, "completion_length": 122.171875, "epoch": 3.42495784148398, "grad_norm": 5.596862017138307, "kl": 0.552734375, "learning_rate": 3.1602023608768967e-07, "loss": 0.0006, "reward": 2.66789174079895, "reward_std": 0.3597968891263008, "rewards/final_reward": 0.12884763875733346, "rewards/mask_iou_reward": 0.06442381937866673, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.667891800403595, "rewards/thk_ans_format_reward": 1.0, "step": 2028, "think_completion_length": 42.59375 }, { "clip_ratio": 0.0, "completion_length": 124.140625, "epoch": 3.426644182124789, "grad_norm": 5.723799357111686, "kl": 0.568359375, "learning_rate": 3.156829679595278e-07, "loss": 0.0006, "reward": 3.7234787940979004, "reward_std": 0.034357505617663264, "rewards/final_reward": 1.5354280173268653, "rewards/mask_iou_reward": 0.7677140086634326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7234787940979004, "rewards/thk_ans_format_reward": 1.0, "step": 2029, "think_completion_length": 43.1875 }, { "clip_ratio": 0.0, "completion_length": 114.84375, "epoch": 3.4283305227655987, "grad_norm": 25.451963993287492, "kl": 0.59375, "learning_rate": 3.1534569983136595e-07, "loss": 0.0006, "reward": 3.073733687400818, "reward_std": 0.2858571792021394, "rewards/final_reward": 0.6562628353296543, "rewards/mask_iou_reward": 0.3281314176648272, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.073733627796173, "rewards/thk_ans_format_reward": 1.0, "step": 2030, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 114.359375, "epoch": 3.430016863406408, "grad_norm": 8.554774241678064, "kl": 0.552734375, "learning_rate": 3.1500843170320404e-07, "loss": 0.0005, "reward": 3.527369976043701, "reward_std": 0.20771950855851173, "rewards/final_reward": 1.6726139266029252, "rewards/mask_iou_reward": 0.8363069633014626, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5273699164390564, "rewards/thk_ans_format_reward": 1.0, "step": 2031, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 114.1875, "epoch": 3.4317032040472175, "grad_norm": 12.03124597768691, "kl": 0.634765625, "learning_rate": 3.146711635750421e-07, "loss": 0.0006, "reward": 3.084221363067627, "reward_std": 0.14684276282787323, "rewards/final_reward": 1.286284375341669, "rewards/mask_iou_reward": 0.6431421876708345, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0842213034629822, "rewards/thk_ans_format_reward": 1.0, "step": 2032, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 125.96875, "epoch": 3.433389544688027, "grad_norm": 7.535228675402098, "kl": 0.5205078125, "learning_rate": 3.1433389544688027e-07, "loss": 0.0005, "reward": 3.4276663064956665, "reward_std": 0.13675907254219055, "rewards/final_reward": 1.650958786137227, "rewards/mask_iou_reward": 0.8254793930686135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4276662468910217, "rewards/thk_ans_format_reward": 1.0, "step": 2033, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 114.3125, "epoch": 3.4350758853288363, "grad_norm": 10.269806157418087, "kl": 0.60546875, "learning_rate": 3.1399662731871835e-07, "loss": 0.0006, "reward": 3.8080811500549316, "reward_std": 0.016493337228894234, "rewards/final_reward": 1.7206912997820765, "rewards/mask_iou_reward": 0.8603456498910382, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8080810904502869, "rewards/thk_ans_format_reward": 1.0, "step": 2034, "think_completion_length": 46.3125 }, { "clip_ratio": 0.0, "completion_length": 167.640625, "epoch": 3.436762225969646, "grad_norm": 87.29511846158675, "kl": 0.5078125, "learning_rate": 3.136593591905565e-07, "loss": 0.0005, "reward": 3.03287672996521, "reward_std": 0.12340293824672699, "rewards/final_reward": 1.4833359764083096, "rewards/mask_iou_reward": 0.7416679882041548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0328766703605652, "rewards/thk_ans_format_reward": 1.0, "step": 2035, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 113.4375, "epoch": 3.438448566610455, "grad_norm": 5.596236022765671, "kl": 0.671875, "learning_rate": 3.133220910623946e-07, "loss": 0.0007, "reward": 3.6413198709487915, "reward_std": 0.041382129304111004, "rewards/final_reward": 1.4613938088920493, "rewards/mask_iou_reward": 0.7306969044460246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6413196921348572, "rewards/thk_ans_format_reward": 1.0, "step": 2036, "think_completion_length": 45.0625 }, { "clip_ratio": 0.0, "completion_length": 112.125, "epoch": 3.440134907251265, "grad_norm": 6.71781932368155, "kl": 0.59375, "learning_rate": 3.129848229342327e-07, "loss": 0.0006, "reward": 3.355802297592163, "reward_std": 0.20005353540182114, "rewards/final_reward": 1.2739943643089002, "rewards/mask_iou_reward": 0.6369971821544501, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3558022379875183, "rewards/thk_ans_format_reward": 1.0, "step": 2037, "think_completion_length": 39.15625 }, { "clip_ratio": 0.0, "completion_length": 113.296875, "epoch": 3.441821247892074, "grad_norm": 27.916419315586456, "kl": 0.525390625, "learning_rate": 3.126475548060708e-07, "loss": 0.0005, "reward": 3.251969575881958, "reward_std": 0.18746953457593918, "rewards/final_reward": 1.3600471383699275, "rewards/mask_iou_reward": 0.6800235691849638, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2519696354866028, "rewards/thk_ans_format_reward": 1.0, "step": 2038, "think_completion_length": 39.75 }, { "clip_ratio": 0.0, "completion_length": 115.9375, "epoch": 3.4435075885328836, "grad_norm": 7.2430728989528275, "kl": 0.66015625, "learning_rate": 3.1231028667790895e-07, "loss": 0.0007, "reward": 3.1523733139038086, "reward_std": 0.3076479956507683, "rewards/final_reward": 1.5067852862366253, "rewards/mask_iou_reward": 0.7533926431183127, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.167998194694519, "rewards/thk_ans_format_reward": 1.0, "step": 2039, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 137.953125, "epoch": 3.4451939291736933, "grad_norm": 4.783105232050546, "kl": 0.509765625, "learning_rate": 3.1197301854974704e-07, "loss": 0.0005, "reward": 3.708993911743164, "reward_std": 0.05525432340800762, "rewards/final_reward": 1.8121373095830768, "rewards/mask_iou_reward": 0.9060686547915384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7089937329292297, "rewards/thk_ans_format_reward": 1.0, "step": 2040, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 117.34375, "epoch": 3.4468802698145025, "grad_norm": 8.11655541151862, "kl": 0.6875, "learning_rate": 3.116357504215851e-07, "loss": 0.0007, "reward": 3.1580730676651, "reward_std": 0.08902622014284134, "rewards/final_reward": 1.2540957808126234, "rewards/mask_iou_reward": 0.6270478904063117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1580730378627777, "rewards/thk_ans_format_reward": 1.0, "step": 2041, "think_completion_length": 47.1875 }, { "clip_ratio": 0.0, "completion_length": 146.8125, "epoch": 3.448566610455312, "grad_norm": 5.143790494484055, "kl": 0.546875, "learning_rate": 3.1129848229342326e-07, "loss": 0.0005, "reward": 3.1901299953460693, "reward_std": 0.058364099357277155, "rewards/final_reward": 0.6773605533276124, "rewards/mask_iou_reward": 0.3386802766638062, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1901300251483917, "rewards/thk_ans_format_reward": 1.0, "step": 2042, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 115.4375, "epoch": 3.4502529510961213, "grad_norm": 7.525608066964579, "kl": 0.55078125, "learning_rate": 3.109612141652614e-07, "loss": 0.0006, "reward": 3.572661876678467, "reward_std": 0.022327865473926067, "rewards/final_reward": 1.7100408556340767, "rewards/mask_iou_reward": 0.8550204278170384, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5726619958877563, "rewards/thk_ans_format_reward": 1.0, "step": 2043, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 113.328125, "epoch": 3.451939291736931, "grad_norm": 16.392322277931605, "kl": 0.603515625, "learning_rate": 3.1062394603709944e-07, "loss": 0.0006, "reward": 3.355070114135742, "reward_std": 0.06577342934906483, "rewards/final_reward": 1.3422343824513452, "rewards/mask_iou_reward": 0.6711171912256726, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3550702929496765, "rewards/thk_ans_format_reward": 1.0, "step": 2044, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 118.046875, "epoch": 3.45362563237774, "grad_norm": 16.5911277595788, "kl": 0.564453125, "learning_rate": 3.102866779089376e-07, "loss": 0.0006, "reward": 3.096205711364746, "reward_std": 0.23673050850629807, "rewards/final_reward": 1.0256573336674095, "rewards/mask_iou_reward": 0.5128286668337048, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.096205621957779, "rewards/thk_ans_format_reward": 1.0, "step": 2045, "think_completion_length": 43.90625 }, { "clip_ratio": 0.0, "completion_length": 113.109375, "epoch": 3.4553119730185498, "grad_norm": 9.168801627251366, "kl": 0.775390625, "learning_rate": 3.099494097807757e-07, "loss": 0.0008, "reward": 3.2718621492385864, "reward_std": 0.02805233560502529, "rewards/final_reward": 1.1143052034870835, "rewards/mask_iou_reward": 0.5571526017435418, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2718620896339417, "rewards/thk_ans_format_reward": 1.0, "step": 2046, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 118.09375, "epoch": 3.4569983136593594, "grad_norm": 9.580867499703865, "kl": 0.57421875, "learning_rate": 3.096121416526138e-07, "loss": 0.0006, "reward": 3.309259295463562, "reward_std": 0.021198630332946777, "rewards/final_reward": 1.6347819329427153, "rewards/mask_iou_reward": 0.8173909664713577, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.309259295463562, "rewards/thk_ans_format_reward": 1.0, "step": 2047, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 131.78125, "epoch": 3.4586846543001686, "grad_norm": 11.641386424264883, "kl": 0.54296875, "learning_rate": 3.092748735244519e-07, "loss": 0.0005, "reward": 3.419318437576294, "reward_std": 0.1268703443929553, "rewards/final_reward": 1.1149699646904263, "rewards/mask_iou_reward": 0.5574849823452132, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4193184971809387, "rewards/thk_ans_format_reward": 1.0, "step": 2048, "think_completion_length": 40.46875 }, { "clip_ratio": 0.0, "completion_length": 113.203125, "epoch": 3.460370994940978, "grad_norm": 10.457163008805031, "kl": 0.591796875, "learning_rate": 3.0893760539629004e-07, "loss": 0.0006, "reward": 3.4929925203323364, "reward_std": 0.06658563949167728, "rewards/final_reward": 1.3797770463612256, "rewards/mask_iou_reward": 0.6898885231806128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4929924607276917, "rewards/thk_ans_format_reward": 1.0, "step": 2049, "think_completion_length": 44.625 }, { "clip_ratio": 0.0, "completion_length": 113.8125, "epoch": 3.4620573355817874, "grad_norm": 12.992427945172517, "kl": 0.57421875, "learning_rate": 3.086003372681282e-07, "loss": 0.0006, "reward": 3.475190281867981, "reward_std": 0.1516074314713478, "rewards/final_reward": 1.3570006034645838, "rewards/mask_iou_reward": 0.6785003017322919, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4751903414726257, "rewards/thk_ans_format_reward": 1.0, "step": 2050, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 126.421875, "epoch": 3.463743676222597, "grad_norm": 10.236935710119583, "kl": 0.68359375, "learning_rate": 3.0826306913996626e-07, "loss": 0.0007, "reward": 3.1847715377807617, "reward_std": 0.059085357934236526, "rewards/final_reward": 0.7592109866888326, "rewards/mask_iou_reward": 0.3796054933444163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.184771478176117, "rewards/thk_ans_format_reward": 1.0, "step": 2051, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 123.65625, "epoch": 3.4654300168634062, "grad_norm": 67.39013973113407, "kl": 0.572265625, "learning_rate": 3.0792580101180435e-07, "loss": 0.0006, "reward": 3.023639678955078, "reward_std": 0.22101550735533237, "rewards/final_reward": 0.8818784059611318, "rewards/mask_iou_reward": 0.4409392029805659, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0236395299434662, "rewards/thk_ans_format_reward": 1.0, "step": 2052, "think_completion_length": 48.25 }, { "clip_ratio": 0.0, "completion_length": 142.390625, "epoch": 3.467116357504216, "grad_norm": 6.259382879220525, "kl": 0.517578125, "learning_rate": 3.075885328836425e-07, "loss": 0.0005, "reward": 3.081355333328247, "reward_std": 0.004808083031093702, "rewards/final_reward": 0.26189683733133445, "rewards/mask_iou_reward": 0.13094841866566723, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0813553929328918, "rewards/thk_ans_format_reward": 1.0, "step": 2053, "think_completion_length": 49.875 }, { "clip_ratio": 0.0, "completion_length": 130.015625, "epoch": 3.4688026981450255, "grad_norm": 12.992020712397627, "kl": 0.521484375, "learning_rate": 3.072512647554806e-07, "loss": 0.0005, "reward": 3.3531652688980103, "reward_std": 0.10442159976810217, "rewards/final_reward": 1.236209052600922, "rewards/mask_iou_reward": 0.618104526300461, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.353165328502655, "rewards/thk_ans_format_reward": 1.0, "step": 2054, "think_completion_length": 46.0625 }, { "clip_ratio": 0.0, "completion_length": 103.28125, "epoch": 3.4704890387858347, "grad_norm": 20.979004051163514, "kl": 0.65234375, "learning_rate": 3.069139966273187e-07, "loss": 0.0007, "reward": 3.363083004951477, "reward_std": 0.19395306333899498, "rewards/final_reward": 1.5368581470637899, "rewards/mask_iou_reward": 0.7684290735318949, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.363083004951477, "rewards/thk_ans_format_reward": 1.0, "step": 2055, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 117.125, "epoch": 3.4721753794266443, "grad_norm": 4.559033940459481, "kl": 0.564453125, "learning_rate": 3.0657672849915686e-07, "loss": 0.0005, "reward": 3.707147717475891, "reward_std": 0.0350824692286551, "rewards/final_reward": 1.6528962266813516, "rewards/mask_iou_reward": 0.8264481133406758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7071477174758911, "rewards/thk_ans_format_reward": 1.0, "step": 2056, "think_completion_length": 48.9375 }, { "clip_ratio": 0.0, "completion_length": 112.421875, "epoch": 3.4738617200674535, "grad_norm": 20.549147936309854, "kl": 0.53515625, "learning_rate": 3.062394603709949e-07, "loss": 0.0005, "reward": 2.985813856124878, "reward_std": 0.0662859920412302, "rewards/final_reward": 0.8053970687702292, "rewards/mask_iou_reward": 0.4026985343851146, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9858137965202332, "rewards/thk_ans_format_reward": 1.0, "step": 2057, "think_completion_length": 46.78125 }, { "clip_ratio": 0.0, "completion_length": 128.84375, "epoch": 3.475548060708263, "grad_norm": 9.984510333726956, "kl": 0.5546875, "learning_rate": 3.0590219224283303e-07, "loss": 0.0006, "reward": 2.868474006652832, "reward_std": 0.028040415607392788, "rewards/final_reward": 0.15886921552929686, "rewards/mask_iou_reward": 0.07943460776464843, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8684740662574768, "rewards/thk_ans_format_reward": 1.0, "step": 2058, "think_completion_length": 50.59375 }, { "clip_ratio": 0.0, "completion_length": 145.09375, "epoch": 3.4772344013490724, "grad_norm": 8.995561128045066, "kl": 0.478515625, "learning_rate": 3.055649241146712e-07, "loss": 0.0005, "reward": 3.6218186616897583, "reward_std": 0.07758180983364582, "rewards/final_reward": 1.6858512783896944, "rewards/mask_iou_reward": 0.8429256391948472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.621818482875824, "rewards/thk_ans_format_reward": 1.0, "step": 2059, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 114.875, "epoch": 3.478920741989882, "grad_norm": 13.009018434002423, "kl": 0.55078125, "learning_rate": 3.0522765598650926e-07, "loss": 0.0005, "reward": 3.158493995666504, "reward_std": 0.31041108816862106, "rewards/final_reward": 1.4331862329200882, "rewards/mask_iou_reward": 0.7165931164600441, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1584938764572144, "rewards/thk_ans_format_reward": 1.0, "step": 2060, "think_completion_length": 44.90625 }, { "clip_ratio": 0.0, "completion_length": 112.46875, "epoch": 3.4806070826306916, "grad_norm": 12.992123348597211, "kl": 0.5703125, "learning_rate": 3.0489038785834735e-07, "loss": 0.0006, "reward": 3.362663745880127, "reward_std": 0.03213449893519282, "rewards/final_reward": 1.0460226018486092, "rewards/mask_iou_reward": 0.5230113009243046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3626638054847717, "rewards/thk_ans_format_reward": 1.0, "step": 2061, "think_completion_length": 41.84375 }, { "clip_ratio": 0.0, "completion_length": 115.421875, "epoch": 3.482293423271501, "grad_norm": 49.926997659929846, "kl": 0.529296875, "learning_rate": 3.045531197301855e-07, "loss": 0.0005, "reward": 3.6378896236419678, "reward_std": 0.1999678835272789, "rewards/final_reward": 1.5644028089646156, "rewards/mask_iou_reward": 0.7822014044823078, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.637889802455902, "rewards/thk_ans_format_reward": 1.0, "step": 2062, "think_completion_length": 45.53125 }, { "clip_ratio": 0.0, "completion_length": 116.6875, "epoch": 3.4839797639123105, "grad_norm": 17.026423042191496, "kl": 0.576171875, "learning_rate": 3.0421585160202363e-07, "loss": 0.0006, "reward": 3.0796077251434326, "reward_std": 0.15330487489700317, "rewards/final_reward": 1.2737249290539945, "rewards/mask_iou_reward": 0.6368624645269972, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0796076208353043, "rewards/thk_ans_format_reward": 1.0, "step": 2063, "think_completion_length": 47.9375 }, { "clip_ratio": 0.0, "completion_length": 140.296875, "epoch": 3.4856661045531196, "grad_norm": 7.000696048676124, "kl": 0.544921875, "learning_rate": 3.038785834738617e-07, "loss": 0.0005, "reward": 3.6956236362457275, "reward_std": 0.06873153895139694, "rewards/final_reward": 1.710976314726648, "rewards/mask_iou_reward": 0.855488157363324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6956236958503723, "rewards/thk_ans_format_reward": 1.0, "step": 2064, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 136.34375, "epoch": 3.4873524451939293, "grad_norm": 15.18804859176664, "kl": 0.56640625, "learning_rate": 3.035413153456998e-07, "loss": 0.0006, "reward": 3.263370156288147, "reward_std": 0.18835239857435226, "rewards/final_reward": 1.274296605476248, "rewards/mask_iou_reward": 0.637148302738124, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2633703351020813, "rewards/thk_ans_format_reward": 1.0, "step": 2065, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 118.578125, "epoch": 3.4890387858347385, "grad_norm": 9.809415007636295, "kl": 0.513671875, "learning_rate": 3.0320404721753794e-07, "loss": 0.0005, "reward": 3.234055280685425, "reward_std": 0.1404377743601799, "rewards/final_reward": 0.9628708190382448, "rewards/mask_iou_reward": 0.4814354095191224, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2340553104877472, "rewards/thk_ans_format_reward": 1.0, "step": 2066, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 124.015625, "epoch": 3.490725126475548, "grad_norm": 18.435724356020913, "kl": 0.517578125, "learning_rate": 3.0286677908937603e-07, "loss": 0.0005, "reward": 3.07632577419281, "reward_std": 0.12655201670713723, "rewards/final_reward": 1.6996674723771896, "rewards/mask_iou_reward": 0.8498337361885948, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0763257145881653, "rewards/thk_ans_format_reward": 1.0, "step": 2067, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 113.765625, "epoch": 3.4924114671163577, "grad_norm": 8.139177398384799, "kl": 0.53125, "learning_rate": 3.0252951096121417e-07, "loss": 0.0006, "reward": 3.680861234664917, "reward_std": 0.017464175820350647, "rewards/final_reward": 1.9384369036629767, "rewards/mask_iou_reward": 0.9692184518314884, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6808614134788513, "rewards/thk_ans_format_reward": 1.0, "step": 2068, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 118.03125, "epoch": 3.494097807757167, "grad_norm": 6.302239880949431, "kl": 0.560546875, "learning_rate": 3.0219224283305226e-07, "loss": 0.0006, "reward": 3.526781678199768, "reward_std": 0.08949761837720871, "rewards/final_reward": 1.5264771402068722, "rewards/mask_iou_reward": 0.7632385701034361, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5267817974090576, "rewards/thk_ans_format_reward": 1.0, "step": 2069, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 149.09375, "epoch": 3.4957841483979766, "grad_norm": 10.267958861421658, "kl": 0.56640625, "learning_rate": 3.0185497470489035e-07, "loss": 0.0006, "reward": 3.438549757003784, "reward_std": 0.05048087425529957, "rewards/final_reward": 1.1073131439090815, "rewards/mask_iou_reward": 0.5536565719545408, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4385497570037842, "rewards/thk_ans_format_reward": 1.0, "step": 2070, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 116.5, "epoch": 3.4974704890387858, "grad_norm": 9.64767925245289, "kl": 0.59375, "learning_rate": 3.015177065767285e-07, "loss": 0.0006, "reward": 3.215143084526062, "reward_std": 0.332039512693882, "rewards/final_reward": 1.2518763445173295, "rewards/mask_iou_reward": 0.6259381722586648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.215143084526062, "rewards/thk_ans_format_reward": 1.0, "step": 2071, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 116.640625, "epoch": 3.4991568296795954, "grad_norm": 11.15703283424711, "kl": 0.595703125, "learning_rate": 3.0118043844856663e-07, "loss": 0.0006, "reward": 3.348411202430725, "reward_std": 0.022082495968788862, "rewards/final_reward": 0.8214227307899277, "rewards/mask_iou_reward": 0.41071136539496383, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3484110832214355, "rewards/thk_ans_format_reward": 1.0, "step": 2072, "think_completion_length": 50.4375 }, { "clip_ratio": 0.0, "completion_length": 129.625, "epoch": 3.5008431703204046, "grad_norm": 9.69937959345132, "kl": 0.5234375, "learning_rate": 3.0084317032040466e-07, "loss": 0.0005, "reward": 3.2116652727127075, "reward_std": 0.2600217703729868, "rewards/final_reward": 1.3683783973116639, "rewards/mask_iou_reward": 0.6841891986558319, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2429153323173523, "rewards/thk_ans_format_reward": 0.984375, "step": 2073, "think_completion_length": 48.4375 }, { "clip_ratio": 0.0, "completion_length": 118.703125, "epoch": 3.5025295109612142, "grad_norm": 10.062390866369451, "kl": 0.501953125, "learning_rate": 3.005059021922428e-07, "loss": 0.0005, "reward": 3.4098644256591797, "reward_std": 0.15565502271056175, "rewards/final_reward": 1.2738018206229793, "rewards/mask_iou_reward": 0.6369009103114897, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4098644256591797, "rewards/thk_ans_format_reward": 1.0, "step": 2074, "think_completion_length": 48.25 }, { "clip_ratio": 0.0, "completion_length": 153.96875, "epoch": 3.504215851602024, "grad_norm": 15.480097162634573, "kl": 0.5146484375, "learning_rate": 3.0016863406408094e-07, "loss": 0.0005, "reward": 3.035758376121521, "reward_std": 0.14913135021924973, "rewards/final_reward": 1.049987013645798, "rewards/mask_iou_reward": 0.524993506822899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0357585549354553, "rewards/thk_ans_format_reward": 1.0, "step": 2075, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 131.71875, "epoch": 3.505902192242833, "grad_norm": 7.040263739913607, "kl": 0.58203125, "learning_rate": 2.998313659359191e-07, "loss": 0.0006, "reward": 3.4088956117630005, "reward_std": 0.12369058793410659, "rewards/final_reward": 1.4691792207676442, "rewards/mask_iou_reward": 0.7345896103838221, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4088955521583557, "rewards/thk_ans_format_reward": 1.0, "step": 2076, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 117.109375, "epoch": 3.5075885328836423, "grad_norm": 8.947858317432594, "kl": 0.53515625, "learning_rate": 2.994940978077571e-07, "loss": 0.0005, "reward": 3.2634823322296143, "reward_std": 0.2896201773546636, "rewards/final_reward": 1.3893840332991332, "rewards/mask_iou_reward": 0.6946920166495666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2634823322296143, "rewards/thk_ans_format_reward": 1.0, "step": 2077, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 114.046875, "epoch": 3.509274873524452, "grad_norm": 11.935600740389882, "kl": 0.57421875, "learning_rate": 2.9915682967959526e-07, "loss": 0.0006, "reward": 3.2649558782577515, "reward_std": 0.41319380700588226, "rewards/final_reward": 1.4942187138608924, "rewards/mask_iou_reward": 0.7471093569304462, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2649559378623962, "rewards/thk_ans_format_reward": 1.0, "step": 2078, "think_completion_length": 46.90625 }, { "clip_ratio": 0.0, "completion_length": 118.890625, "epoch": 3.5109612141652615, "grad_norm": 9.365130805074894, "kl": 0.576171875, "learning_rate": 2.988195615514334e-07, "loss": 0.0006, "reward": 3.0650811195373535, "reward_std": 0.06612196192145348, "rewards/final_reward": 0.7571907244586673, "rewards/mask_iou_reward": 0.37859536222933365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0650811791419983, "rewards/thk_ans_format_reward": 1.0, "step": 2079, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 115.765625, "epoch": 3.5126475548060707, "grad_norm": 8.560600749245339, "kl": 0.505859375, "learning_rate": 2.984822934232715e-07, "loss": 0.0005, "reward": 3.578374981880188, "reward_std": 0.3707886040210724, "rewards/final_reward": 1.6643127275437786, "rewards/mask_iou_reward": 0.8321563637718893, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5783750414848328, "rewards/thk_ans_format_reward": 1.0, "step": 2080, "think_completion_length": 45.96875 }, { "clip_ratio": 0.0, "completion_length": 119.015625, "epoch": 3.5143338954468804, "grad_norm": 7.321228522701317, "kl": 0.548828125, "learning_rate": 2.9814502529510957e-07, "loss": 0.0005, "reward": 2.8969147205352783, "reward_std": 0.21056237444281578, "rewards/final_reward": 1.0241504374650594, "rewards/mask_iou_reward": 0.5120752187325297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.89691461622715, "rewards/thk_ans_format_reward": 1.0, "step": 2081, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 110.328125, "epoch": 3.51602023608769, "grad_norm": 23.63771581498818, "kl": 0.611328125, "learning_rate": 2.978077571669477e-07, "loss": 0.0006, "reward": 3.10478937625885, "reward_std": 0.08834952488541603, "rewards/final_reward": 0.9242398913792284, "rewards/mask_iou_reward": 0.4621199456896142, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1047892272472382, "rewards/thk_ans_format_reward": 1.0, "step": 2082, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 115.828125, "epoch": 3.517706576728499, "grad_norm": 8.000584246347364, "kl": 0.6640625, "learning_rate": 2.974704890387858e-07, "loss": 0.0006, "reward": 3.3999558687210083, "reward_std": 0.1016635000705719, "rewards/final_reward": 1.271658266660058, "rewards/mask_iou_reward": 0.635829133330029, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.399955689907074, "rewards/thk_ans_format_reward": 1.0, "step": 2083, "think_completion_length": 46.46875 }, { "clip_ratio": 0.0, "completion_length": 116.296875, "epoch": 3.5193929173693084, "grad_norm": 6.863503915469555, "kl": 0.677734375, "learning_rate": 2.9713322091062394e-07, "loss": 0.0007, "reward": 3.251119613647461, "reward_std": 0.05742297577671707, "rewards/final_reward": 0.8811621420140534, "rewards/mask_iou_reward": 0.4405810710070267, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2511195540428162, "rewards/thk_ans_format_reward": 1.0, "step": 2084, "think_completion_length": 45.375 }, { "clip_ratio": 0.0, "completion_length": 115.46875, "epoch": 3.521079258010118, "grad_norm": 10.149551426716297, "kl": 0.578125, "learning_rate": 2.967959527824621e-07, "loss": 0.0006, "reward": 3.618879556655884, "reward_std": 0.1719725530128926, "rewards/final_reward": 1.7024318654775046, "rewards/mask_iou_reward": 0.8512159327387523, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6188794374465942, "rewards/thk_ans_format_reward": 1.0, "step": 2085, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 147.328125, "epoch": 3.5227655986509276, "grad_norm": 11.229567750401436, "kl": 0.53515625, "learning_rate": 2.964586846543001e-07, "loss": 0.0005, "reward": 3.2309197187423706, "reward_std": 0.06877991370856762, "rewards/final_reward": 0.8686235580294994, "rewards/mask_iou_reward": 0.4343117790147497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2309198379516602, "rewards/thk_ans_format_reward": 1.0, "step": 2086, "think_completion_length": 48.0 }, { "clip_ratio": 0.0, "completion_length": 111.140625, "epoch": 3.524451939291737, "grad_norm": 8.016667921124851, "kl": 0.59375, "learning_rate": 2.9612141652613826e-07, "loss": 0.0006, "reward": 3.398813247680664, "reward_std": 0.2085051666945219, "rewards/final_reward": 1.6540931938829095, "rewards/mask_iou_reward": 0.8270465969414548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3988131284713745, "rewards/thk_ans_format_reward": 1.0, "step": 2087, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 124.78125, "epoch": 3.5261382799325465, "grad_norm": 8.287625638806832, "kl": 0.5234375, "learning_rate": 2.957841483979764e-07, "loss": 0.0005, "reward": 2.9071192741394043, "reward_std": 0.3184506855905056, "rewards/final_reward": 0.16841956742067227, "rewards/mask_iou_reward": 0.08420978371033613, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9071193039417267, "rewards/thk_ans_format_reward": 1.0, "step": 2088, "think_completion_length": 44.375 }, { "clip_ratio": 0.0, "completion_length": 115.34375, "epoch": 3.5278246205733557, "grad_norm": 8.820546084984757, "kl": 0.9140625, "learning_rate": 2.9544688026981454e-07, "loss": 0.0009, "reward": 3.3051774501800537, "reward_std": 0.057986740954220295, "rewards/final_reward": 1.616677442440604, "rewards/mask_iou_reward": 0.808338721220302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3051775097846985, "rewards/thk_ans_format_reward": 1.0, "step": 2089, "think_completion_length": 47.875 }, { "clip_ratio": 0.0, "completion_length": 117.234375, "epoch": 3.5295109612141653, "grad_norm": 6.267811168411706, "kl": 0.552734375, "learning_rate": 2.9510961214165257e-07, "loss": 0.0006, "reward": 2.8216934204101562, "reward_std": 0.16679880395531654, "rewards/final_reward": 0.12753961957074536, "rewards/mask_iou_reward": 0.06376980978537268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8216933906078339, "rewards/thk_ans_format_reward": 1.0, "step": 2090, "think_completion_length": 47.5625 }, { "clip_ratio": 0.0, "completion_length": 115.171875, "epoch": 3.5311973018549745, "grad_norm": 23.750264428638094, "kl": 0.583984375, "learning_rate": 2.947723440134907e-07, "loss": 0.0006, "reward": 3.4246087074279785, "reward_std": 0.2074672132730484, "rewards/final_reward": 1.331872885254827, "rewards/mask_iou_reward": 0.6659364426274135, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.424608588218689, "rewards/thk_ans_format_reward": 1.0, "step": 2091, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 119.203125, "epoch": 3.532883642495784, "grad_norm": 27.6083076016815, "kl": 0.57421875, "learning_rate": 2.9443507588532885e-07, "loss": 0.0006, "reward": 3.294243812561035, "reward_std": 0.05477495677769184, "rewards/final_reward": 1.044439194811737, "rewards/mask_iou_reward": 0.5222195974058685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2942437827587128, "rewards/thk_ans_format_reward": 1.0, "step": 2092, "think_completion_length": 52.59375 }, { "clip_ratio": 0.0, "completion_length": 114.171875, "epoch": 3.5345699831365938, "grad_norm": 11.86318269840444, "kl": 2.19140625, "learning_rate": 2.9409780775716694e-07, "loss": 0.0022, "reward": 3.617302179336548, "reward_std": 0.19097769260406494, "rewards/final_reward": 1.4741462494303408, "rewards/mask_iou_reward": 0.7370731247151704, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.617302119731903, "rewards/thk_ans_format_reward": 1.0, "step": 2093, "think_completion_length": 42.75 }, { "clip_ratio": 0.0, "completion_length": 177.359375, "epoch": 3.536256323777403, "grad_norm": 33.15216981502035, "kl": 0.5703125, "learning_rate": 2.9376053962900503e-07, "loss": 0.0006, "reward": 3.1339809894561768, "reward_std": 0.12357348203659058, "rewards/final_reward": 0.784641664259337, "rewards/mask_iou_reward": 0.3923208321296685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.133980929851532, "rewards/thk_ans_format_reward": 1.0, "step": 2094, "think_completion_length": 42.25 }, { "clip_ratio": 0.0, "completion_length": 129.78125, "epoch": 3.5379426644182126, "grad_norm": 7.771322430647064, "kl": 0.533203125, "learning_rate": 2.9342327150084317e-07, "loss": 0.0005, "reward": 3.645893096923828, "reward_std": 0.17055297642946243, "rewards/final_reward": 1.548257813426118, "rewards/mask_iou_reward": 0.774128906713059, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6458930969238281, "rewards/thk_ans_format_reward": 1.0, "step": 2095, "think_completion_length": 45.4375 }, { "clip_ratio": 0.0, "completion_length": 116.890625, "epoch": 3.539629005059022, "grad_norm": 16.288824310059873, "kl": 0.5546875, "learning_rate": 2.9308600337268125e-07, "loss": 0.0006, "reward": 3.142457604408264, "reward_std": 0.24793227389454842, "rewards/final_reward": 1.0366077066693755, "rewards/mask_iou_reward": 0.5183038533346878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1424575448036194, "rewards/thk_ans_format_reward": 1.0, "step": 2096, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 117.578125, "epoch": 3.5413153456998314, "grad_norm": 7.628982481255148, "kl": 0.607421875, "learning_rate": 2.927487352445194e-07, "loss": 0.0006, "reward": 3.7804569005966187, "reward_std": 0.09163061529397964, "rewards/final_reward": 1.869259836081693, "rewards/mask_iou_reward": 0.9346299180408465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7804570198059082, "rewards/thk_ans_format_reward": 1.0, "step": 2097, "think_completion_length": 48.125 }, { "clip_ratio": 0.0, "completion_length": 116.65625, "epoch": 3.5430016863406406, "grad_norm": 6.984260855630186, "kl": 0.5859375, "learning_rate": 2.924114671163575e-07, "loss": 0.0006, "reward": 3.530802369117737, "reward_std": 0.38594286143779755, "rewards/final_reward": 1.2610482569802852, "rewards/mask_iou_reward": 0.6305241284901426, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.5464274883270264, "rewards/thk_ans_format_reward": 1.0, "step": 2098, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 115.3125, "epoch": 3.5446880269814502, "grad_norm": 14.513378746218695, "kl": 0.580078125, "learning_rate": 2.9207419898819557e-07, "loss": 0.0006, "reward": 3.3536789417266846, "reward_std": 0.06767284963279963, "rewards/final_reward": 1.630382387472711, "rewards/mask_iou_reward": 0.8151911937363555, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3536791801452637, "rewards/thk_ans_format_reward": 1.0, "step": 2099, "think_completion_length": 48.5 }, { "clip_ratio": 0.0, "completion_length": 129.53125, "epoch": 3.54637436762226, "grad_norm": 34.519418419568595, "kl": 0.494140625, "learning_rate": 2.917369308600337e-07, "loss": 0.0005, "reward": 3.3592960834503174, "reward_std": 0.10136066749691963, "rewards/final_reward": 0.8570358125793107, "rewards/mask_iou_reward": 0.42851790628965536, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3592960834503174, "rewards/thk_ans_format_reward": 1.0, "step": 2100, "think_completion_length": 51.5 }, { "clip_ratio": 0.0, "completion_length": 117.390625, "epoch": 3.548060708263069, "grad_norm": 4.852262057713968, "kl": 0.5546875, "learning_rate": 2.9139966273187185e-07, "loss": 0.0006, "reward": 2.8021715879440308, "reward_std": 0.014212753623723984, "rewards/final_reward": 0.7047118073511891, "rewards/mask_iou_reward": 0.35235590367559455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8021714687347412, "rewards/thk_ans_format_reward": 1.0, "step": 2101, "think_completion_length": 45.84375 }, { "clip_ratio": 0.0, "completion_length": 138.96875, "epoch": 3.5497470489038787, "grad_norm": 70.12061063949474, "kl": 0.55859375, "learning_rate": 2.9106239460370994e-07, "loss": 0.0006, "reward": 3.3605268001556396, "reward_std": 0.2995372787117958, "rewards/final_reward": 1.2963307667207167, "rewards/mask_iou_reward": 0.6481653833603583, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3605269193649292, "rewards/thk_ans_format_reward": 1.0, "step": 2102, "think_completion_length": 50.71875 }, { "clip_ratio": 0.0, "completion_length": 116.09375, "epoch": 3.551433389544688, "grad_norm": 11.10089122007049, "kl": 0.544921875, "learning_rate": 2.90725126475548e-07, "loss": 0.0005, "reward": 3.38996422290802, "reward_std": 0.15020517259836197, "rewards/final_reward": 1.464758414052994, "rewards/mask_iou_reward": 0.732379207026497, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.38996422290802, "rewards/thk_ans_format_reward": 1.0, "step": 2103, "think_completion_length": 47.65625 }, { "clip_ratio": 0.0, "completion_length": 114.265625, "epoch": 3.5531197301854975, "grad_norm": 13.02278757136033, "kl": 0.55078125, "learning_rate": 2.9038785834738617e-07, "loss": 0.0006, "reward": 3.147314429283142, "reward_std": 0.14261020720005035, "rewards/final_reward": 1.344845371331086, "rewards/mask_iou_reward": 0.672422685665543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1473142802715302, "rewards/thk_ans_format_reward": 1.0, "step": 2104, "think_completion_length": 41.5625 }, { "clip_ratio": 0.0, "completion_length": 131.78125, "epoch": 3.5548060708263067, "grad_norm": 11.260745371599246, "kl": 0.5166015625, "learning_rate": 2.900505902192243e-07, "loss": 0.0006, "reward": 3.1299134492874146, "reward_std": 0.04871807433664799, "rewards/final_reward": 1.3929734948617756, "rewards/mask_iou_reward": 0.6964867474308878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1299134194850922, "rewards/thk_ans_format_reward": 1.0, "step": 2105, "think_completion_length": 50.75 }, { "clip_ratio": 0.0, "completion_length": 133.59375, "epoch": 3.5564924114671164, "grad_norm": 14.253061236679251, "kl": 0.525390625, "learning_rate": 2.8971332209106234e-07, "loss": 0.0005, "reward": 3.40402615070343, "reward_std": 0.23270705668255687, "rewards/final_reward": 1.2606720556852906, "rewards/mask_iou_reward": 0.6303360278426453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4040260016918182, "rewards/thk_ans_format_reward": 1.0, "step": 2106, "think_completion_length": 48.21875 }, { "clip_ratio": 0.0, "completion_length": 113.125, "epoch": 3.558178752107926, "grad_norm": 9.729457631519049, "kl": 0.443359375, "learning_rate": 2.893760539629005e-07, "loss": 0.0004, "reward": 3.751418352127075, "reward_std": 0.25814956426620483, "rewards/final_reward": 1.8125, "rewards/mask_iou_reward": 0.90625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7514183521270752, "rewards/thk_ans_format_reward": 1.0, "step": 2107, "think_completion_length": 44.90625 }, { "clip_ratio": 0.0, "completion_length": 124.625, "epoch": 3.559865092748735, "grad_norm": 9.294937389808833, "kl": 0.55859375, "learning_rate": 2.890387858347386e-07, "loss": 0.0006, "reward": 3.4791066646575928, "reward_std": 0.26504068821668625, "rewards/final_reward": 1.3581809205721262, "rewards/mask_iou_reward": 0.6790904602860631, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4791066646575928, "rewards/thk_ans_format_reward": 1.0, "step": 2108, "think_completion_length": 47.1875 }, { "clip_ratio": 0.0, "completion_length": 118.734375, "epoch": 3.561551433389545, "grad_norm": 10.856018909508016, "kl": 0.568359375, "learning_rate": 2.887015177065767e-07, "loss": 0.0006, "reward": 3.142184615135193, "reward_std": 0.06791120395064354, "rewards/final_reward": 1.3566191082046042, "rewards/mask_iou_reward": 0.6783095541023021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1421846747398376, "rewards/thk_ans_format_reward": 1.0, "step": 2109, "think_completion_length": 48.5625 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 3.563237774030354, "grad_norm": 6.750033064551902, "kl": 0.55859375, "learning_rate": 2.883642495784148e-07, "loss": 0.0006, "reward": 3.1395514011383057, "reward_std": 0.1944444328546524, "rewards/final_reward": 1.059547095767144, "rewards/mask_iou_reward": 0.529773547883572, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1395514905452728, "rewards/thk_ans_format_reward": 1.0, "step": 2110, "think_completion_length": 44.40625 }, { "clip_ratio": 0.0, "completion_length": 102.5625, "epoch": 3.5649241146711637, "grad_norm": 8.25354027902421, "kl": 0.517578125, "learning_rate": 2.8802698145025294e-07, "loss": 0.0006, "reward": 3.530856966972351, "reward_std": 0.1734582866774872, "rewards/final_reward": 1.8629343088128367, "rewards/mask_iou_reward": 0.9314671544064184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5308570265769958, "rewards/thk_ans_format_reward": 1.0, "step": 2111, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 113.53125, "epoch": 3.566610455311973, "grad_norm": 14.605550783250727, "kl": 0.533203125, "learning_rate": 2.876897133220911e-07, "loss": 0.0006, "reward": 3.368571400642395, "reward_std": 0.017685976112261415, "rewards/final_reward": 1.5584769869884578, "rewards/mask_iou_reward": 0.7792384934942289, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3685712814331055, "rewards/thk_ans_format_reward": 1.0, "step": 2112, "think_completion_length": 41.6875 }, { "clip_ratio": 0.0, "completion_length": 128.59375, "epoch": 3.5682967959527825, "grad_norm": 10.667209261447056, "kl": 0.5078125, "learning_rate": 2.8735244519392916e-07, "loss": 0.0005, "reward": 2.9554500579833984, "reward_std": 0.12431775592267513, "rewards/final_reward": 0.3111894509782488, "rewards/mask_iou_reward": 0.1555947254891244, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9554500877857208, "rewards/thk_ans_format_reward": 1.0, "step": 2113, "think_completion_length": 44.0625 }, { "clip_ratio": 0.0, "completion_length": 117.5, "epoch": 3.569983136593592, "grad_norm": 7.334798139651193, "kl": 0.537109375, "learning_rate": 2.870151770657673e-07, "loss": 0.0005, "reward": 3.0563929080963135, "reward_std": 0.16057665273547173, "rewards/final_reward": 0.77717497274694, "rewards/mask_iou_reward": 0.38858748637347, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0563929378986359, "rewards/thk_ans_format_reward": 1.0, "step": 2114, "think_completion_length": 48.28125 }, { "clip_ratio": 0.0, "completion_length": 126.53125, "epoch": 3.5716694772344013, "grad_norm": 9.965742861168454, "kl": 0.513671875, "learning_rate": 2.866779089376054e-07, "loss": 0.0005, "reward": 3.605561137199402, "reward_std": 0.06425703875720501, "rewards/final_reward": 1.7104030906984011, "rewards/mask_iou_reward": 0.8552015453492006, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6055611371994019, "rewards/thk_ans_format_reward": 1.0, "step": 2115, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 116.859375, "epoch": 3.573355817875211, "grad_norm": 10.606011190407303, "kl": 0.5078125, "learning_rate": 2.863406408094435e-07, "loss": 0.0005, "reward": 3.346148729324341, "reward_std": 0.08017969503998756, "rewards/final_reward": 1.2055786473120078, "rewards/mask_iou_reward": 0.6027893236560039, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3461489081382751, "rewards/thk_ans_format_reward": 1.0, "step": 2116, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 115.078125, "epoch": 3.57504215851602, "grad_norm": 16.286723500192668, "kl": 0.5068359375, "learning_rate": 2.860033726812816e-07, "loss": 0.0005, "reward": 3.3839882612228394, "reward_std": 0.0907436553388834, "rewards/final_reward": 1.8187623688096652, "rewards/mask_iou_reward": 0.9093811844048326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.383988231420517, "rewards/thk_ans_format_reward": 1.0, "step": 2117, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 116.90625, "epoch": 3.5767284991568298, "grad_norm": 7.10774667990878, "kl": 0.521484375, "learning_rate": 2.8566610455311976e-07, "loss": 0.0005, "reward": 3.3456366062164307, "reward_std": 0.03418039623647928, "rewards/final_reward": 1.7127510501139098, "rewards/mask_iou_reward": 0.8563755250569549, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3456366062164307, "rewards/thk_ans_format_reward": 1.0, "step": 2118, "think_completion_length": 44.4375 }, { "clip_ratio": 0.0, "completion_length": 120.109375, "epoch": 3.578414839797639, "grad_norm": 13.503292561770218, "kl": 0.55859375, "learning_rate": 2.853288364249578e-07, "loss": 0.0006, "reward": 3.763441801071167, "reward_std": 0.032608107663691044, "rewards/final_reward": 1.7397966034509587, "rewards/mask_iou_reward": 0.8698983017254793, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.763441801071167, "rewards/thk_ans_format_reward": 1.0, "step": 2119, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 134.234375, "epoch": 3.5801011804384486, "grad_norm": 12.169735293725726, "kl": 0.61328125, "learning_rate": 2.8499156829679593e-07, "loss": 0.0006, "reward": 2.9567670822143555, "reward_std": 0.3452972024679184, "rewards/final_reward": 0.9493169032549391, "rewards/mask_iou_reward": 0.47465845162746956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9567671567201614, "rewards/thk_ans_format_reward": 1.0, "step": 2120, "think_completion_length": 45.5625 }, { "clip_ratio": 0.0, "completion_length": 117.390625, "epoch": 3.5817875210792582, "grad_norm": 72.34801198564061, "kl": 0.595703125, "learning_rate": 2.846543001686341e-07, "loss": 0.0006, "reward": 3.4123685359954834, "reward_std": 0.07814565277658403, "rewards/final_reward": 1.441254424751838, "rewards/mask_iou_reward": 0.720627212375919, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4123685359954834, "rewards/thk_ans_format_reward": 1.0, "step": 2121, "think_completion_length": 46.21875 }, { "clip_ratio": 0.0, "completion_length": 117.046875, "epoch": 3.5834738617200674, "grad_norm": 18.001668662737146, "kl": 0.595703125, "learning_rate": 2.8431703204047216e-07, "loss": 0.0006, "reward": 3.4009355306625366, "reward_std": 0.226658396422863, "rewards/final_reward": 1.5326358044587276, "rewards/mask_iou_reward": 0.7663179022293638, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.4165604710578918, "rewards/thk_ans_format_reward": 1.0, "step": 2122, "think_completion_length": 49.71875 }, { "clip_ratio": 0.0, "completion_length": 122.734375, "epoch": 3.5851602023608766, "grad_norm": 6.756132574804486, "kl": 0.501953125, "learning_rate": 2.8397976391231025e-07, "loss": 0.0005, "reward": 3.0923595428466797, "reward_std": 0.4571046978235245, "rewards/final_reward": 0.6847192336418062, "rewards/mask_iou_reward": 0.3423596168209031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0923596620559692, "rewards/thk_ans_format_reward": 1.0, "step": 2123, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 156.71875, "epoch": 3.5868465430016863, "grad_norm": 4.699953491289403, "kl": 0.443359375, "learning_rate": 2.836424957841484e-07, "loss": 0.0004, "reward": 3.352581024169922, "reward_std": 0.12979009747505188, "rewards/final_reward": 1.2676309486607986, "rewards/mask_iou_reward": 0.6338154743303993, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3525812029838562, "rewards/thk_ans_format_reward": 1.0, "step": 2124, "think_completion_length": 40.84375 }, { "clip_ratio": 0.0, "completion_length": 111.734375, "epoch": 3.588532883642496, "grad_norm": 9.479851297811361, "kl": 0.564453125, "learning_rate": 2.8330522765598653e-07, "loss": 0.0006, "reward": 3.3169244527816772, "reward_std": 0.07373960688710213, "rewards/final_reward": 1.3375584167549102, "rewards/mask_iou_reward": 0.6687792083774551, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.316924512386322, "rewards/thk_ans_format_reward": 1.0, "step": 2125, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 113.59375, "epoch": 3.590219224283305, "grad_norm": 9.675427623917646, "kl": 0.6015625, "learning_rate": 2.829679595278246e-07, "loss": 0.0006, "reward": 3.5323774814605713, "reward_std": 0.021160707343369722, "rewards/final_reward": 1.3137672896730819, "rewards/mask_iou_reward": 0.6568836448365409, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.532377541065216, "rewards/thk_ans_format_reward": 1.0, "step": 2126, "think_completion_length": 43.0625 }, { "clip_ratio": 0.0, "completion_length": 115.640625, "epoch": 3.5919055649241147, "grad_norm": 14.935509325663402, "kl": 0.60546875, "learning_rate": 2.826306913996627e-07, "loss": 0.0006, "reward": 3.5516492128372192, "reward_std": 0.07431310974061489, "rewards/final_reward": 1.1481026905321163, "rewards/mask_iou_reward": 0.5740513452660582, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5516492128372192, "rewards/thk_ans_format_reward": 1.0, "step": 2127, "think_completion_length": 43.84375 }, { "clip_ratio": 0.0, "completion_length": 245.484375, "epoch": 3.5935919055649244, "grad_norm": 14.142273296969108, "kl": 0.421875, "learning_rate": 2.8229342327150084e-07, "loss": 0.0004, "reward": 3.523800492286682, "reward_std": 0.23098544776439667, "rewards/final_reward": 1.2040698640666694, "rewards/mask_iou_reward": 0.6020349320333347, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.555050551891327, "rewards/thk_ans_format_reward": 0.984375, "step": 2128, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 138.9375, "epoch": 3.5952782462057336, "grad_norm": 15.506942761325954, "kl": 0.541015625, "learning_rate": 2.8195615514333893e-07, "loss": 0.0005, "reward": 2.981836199760437, "reward_std": 0.05073126032948494, "rewards/final_reward": 1.829483273128682, "rewards/mask_iou_reward": 0.914741636564341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.981836199760437, "rewards/thk_ans_format_reward": 1.0, "step": 2129, "think_completion_length": 44.71875 }, { "clip_ratio": 0.0, "completion_length": 114.84375, "epoch": 3.5969645868465427, "grad_norm": 13.470969997373983, "kl": 0.55078125, "learning_rate": 2.8161888701517707e-07, "loss": 0.0005, "reward": 3.1844255924224854, "reward_std": 0.09609784232452512, "rewards/final_reward": 1.4231125763751638, "rewards/mask_iou_reward": 0.7115562881875819, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.200050711631775, "rewards/thk_ans_format_reward": 0.984375, "step": 2130, "think_completion_length": 45.84375 }, { "clip_ratio": 0.0, "completion_length": 137.03125, "epoch": 3.5986509274873524, "grad_norm": 4.3856864863237845, "kl": 0.568359375, "learning_rate": 2.8128161888701516e-07, "loss": 0.0005, "reward": 3.5685439109802246, "reward_std": 0.07929562008939683, "rewards/final_reward": 1.2761401732846926, "rewards/mask_iou_reward": 0.6380700866423463, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.568543791770935, "rewards/thk_ans_format_reward": 1.0, "step": 2131, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 115.1875, "epoch": 3.600337268128162, "grad_norm": 5.688163167135513, "kl": 0.595703125, "learning_rate": 2.8094435075885325e-07, "loss": 0.0006, "reward": 2.721096992492676, "reward_std": 0.0583833334967494, "rewards/final_reward": 0.7634238064732274, "rewards/mask_iou_reward": 0.3817119032366137, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7210970818996429, "rewards/thk_ans_format_reward": 1.0, "step": 2132, "think_completion_length": 44.75 }, { "clip_ratio": 0.0, "completion_length": 125.65625, "epoch": 3.602023608768971, "grad_norm": 5.832818627047222, "kl": 0.52734375, "learning_rate": 2.806070826306914e-07, "loss": 0.0005, "reward": 3.522038459777832, "reward_std": 0.04175010113976896, "rewards/final_reward": 1.7146074397476556, "rewards/mask_iou_reward": 0.8573037198738278, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5220386385917664, "rewards/thk_ans_format_reward": 1.0, "step": 2133, "think_completion_length": 44.78125 }, { "clip_ratio": 0.0, "completion_length": 128.65625, "epoch": 3.603709949409781, "grad_norm": 27.303612577890018, "kl": 0.599609375, "learning_rate": 2.8026981450252953e-07, "loss": 0.0006, "reward": 3.473210334777832, "reward_std": 0.07562324404716492, "rewards/final_reward": 1.753751732934803, "rewards/mask_iou_reward": 0.8768758664674015, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.473210334777832, "rewards/thk_ans_format_reward": 1.0, "step": 2134, "think_completion_length": 45.625 }, { "clip_ratio": 0.0, "completion_length": 135.40625, "epoch": 3.6053962900505905, "grad_norm": 6.878462017998155, "kl": 0.5546875, "learning_rate": 2.7993254637436756e-07, "loss": 0.0006, "reward": 3.054173469543457, "reward_std": 0.2945948615670204, "rewards/final_reward": 0.9387926834905617, "rewards/mask_iou_reward": 0.46939634174528083, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.085423469543457, "rewards/thk_ans_format_reward": 0.984375, "step": 2135, "think_completion_length": 47.53125 }, { "clip_ratio": 0.0, "completion_length": 128.90625, "epoch": 3.6070826306913997, "grad_norm": 10.653031644305647, "kl": 0.5625, "learning_rate": 2.795952782462057e-07, "loss": 0.0006, "reward": 3.5420931577682495, "reward_std": 0.0636497251689434, "rewards/final_reward": 1.819818398576865, "rewards/mask_iou_reward": 0.9099091992884325, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5420931577682495, "rewards/thk_ans_format_reward": 1.0, "step": 2136, "think_completion_length": 46.21875 }, { "clip_ratio": 0.0, "completion_length": 112.890625, "epoch": 3.608768971332209, "grad_norm": 10.843170586952544, "kl": 0.6015625, "learning_rate": 2.7925801011804384e-07, "loss": 0.0006, "reward": 3.737997055053711, "reward_std": 0.013210067059844732, "rewards/final_reward": 1.795223586591173, "rewards/mask_iou_reward": 0.8976117932955865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7379971742630005, "rewards/thk_ans_format_reward": 1.0, "step": 2137, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 120.640625, "epoch": 3.6104553119730185, "grad_norm": 16.198534079929008, "kl": 0.546875, "learning_rate": 2.78920741989882e-07, "loss": 0.0005, "reward": 3.37510347366333, "reward_std": 0.06835968187078834, "rewards/final_reward": 1.3537600217636836, "rewards/mask_iou_reward": 0.6768800108818418, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3751035332679749, "rewards/thk_ans_format_reward": 1.0, "step": 2138, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 120.03125, "epoch": 3.612141652613828, "grad_norm": 5.91434083685604, "kl": 0.5859375, "learning_rate": 2.7858347386172e-07, "loss": 0.0006, "reward": 3.16507625579834, "reward_std": 0.0929767694324255, "rewards/final_reward": 1.3173989301904212, "rewards/mask_iou_reward": 0.6586994650952106, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1650761365890503, "rewards/thk_ans_format_reward": 1.0, "step": 2139, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 118.28125, "epoch": 3.6138279932546373, "grad_norm": 8.373994110782936, "kl": 0.54296875, "learning_rate": 2.7824620573355816e-07, "loss": 0.0005, "reward": 3.7722244262695312, "reward_std": 0.06805156078189611, "rewards/final_reward": 1.8827298616919066, "rewards/mask_iou_reward": 0.9413649308459533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7722243666648865, "rewards/thk_ans_format_reward": 1.0, "step": 2140, "think_completion_length": 40.78125 }, { "clip_ratio": 0.0, "completion_length": 123.53125, "epoch": 3.615514333895447, "grad_norm": 17.268896812503602, "kl": 0.556640625, "learning_rate": 2.779089376053963e-07, "loss": 0.0006, "reward": 2.971988797187805, "reward_std": 0.08881898410618305, "rewards/final_reward": 0.6324427287793092, "rewards/mask_iou_reward": 0.3162213643896546, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.97198885679245, "rewards/thk_ans_format_reward": 1.0, "step": 2141, "think_completion_length": 48.375 }, { "clip_ratio": 0.0, "completion_length": 112.65625, "epoch": 3.6172006745362566, "grad_norm": 6.8642257578860955, "kl": 0.609375, "learning_rate": 2.775716694772344e-07, "loss": 0.0006, "reward": 3.305831551551819, "reward_std": 0.19308728724718094, "rewards/final_reward": 0.966853426385776, "rewards/mask_iou_reward": 0.483426713192888, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3058315515518188, "rewards/thk_ans_format_reward": 1.0, "step": 2142, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 263.609375, "epoch": 3.618887015177066, "grad_norm": 13.044769944871176, "kl": 0.4228515625, "learning_rate": 2.7723440134907247e-07, "loss": 0.0004, "reward": 3.3725874423980713, "reward_std": 0.1456987876445055, "rewards/final_reward": 1.3800859115573243, "rewards/mask_iou_reward": 0.6900429557786621, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3725875616073608, "rewards/thk_ans_format_reward": 1.0, "step": 2143, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 98.4375, "epoch": 3.620573355817875, "grad_norm": 12.654651665103493, "kl": 0.5703125, "learning_rate": 2.768971332209106e-07, "loss": 0.0006, "reward": 3.2385579347610474, "reward_std": 0.20924285799264908, "rewards/final_reward": 1.1544221282593377, "rewards/mask_iou_reward": 0.5772110641296688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2385579347610474, "rewards/thk_ans_format_reward": 1.0, "step": 2144, "think_completion_length": 38.25 }, { "clip_ratio": 0.0, "completion_length": 115.78125, "epoch": 3.6222596964586846, "grad_norm": 9.375804256565166, "kl": 0.66015625, "learning_rate": 2.765598650927487e-07, "loss": 0.0007, "reward": 3.3023223876953125, "reward_std": 0.09674730151891708, "rewards/final_reward": 1.332632176942337, "rewards/mask_iou_reward": 0.6663160884711685, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3023223280906677, "rewards/thk_ans_format_reward": 1.0, "step": 2145, "think_completion_length": 43.09375 }, { "clip_ratio": 0.0, "completion_length": 112.78125, "epoch": 3.6239460370994943, "grad_norm": 77.38104026847395, "kl": 0.5703125, "learning_rate": 2.7622259696458684e-07, "loss": 0.0006, "reward": 3.5058757066726685, "reward_std": 0.0840764888562262, "rewards/final_reward": 1.5100866179690322, "rewards/mask_iou_reward": 0.7550433089845161, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5058757662773132, "rewards/thk_ans_format_reward": 1.0, "step": 2146, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 116.890625, "epoch": 3.6256323777403034, "grad_norm": 13.735014321000364, "kl": 0.564453125, "learning_rate": 2.75885328836425e-07, "loss": 0.0006, "reward": 3.601733446121216, "reward_std": 0.02019692724570632, "rewards/final_reward": 1.3094032991992814, "rewards/mask_iou_reward": 0.6547016495996407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6017334461212158, "rewards/thk_ans_format_reward": 1.0, "step": 2147, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 131.625, "epoch": 3.627318718381113, "grad_norm": 6.036157171143187, "kl": 0.50390625, "learning_rate": 2.75548060708263e-07, "loss": 0.0005, "reward": 3.8341445922851562, "reward_std": 0.07307778589893132, "rewards/final_reward": 1.8001251496910746, "rewards/mask_iou_reward": 0.9000625748455373, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8341445326805115, "rewards/thk_ans_format_reward": 1.0, "step": 2148, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 127.734375, "epoch": 3.6290050590219223, "grad_norm": 7.463962254660814, "kl": 0.5625, "learning_rate": 2.7521079258010116e-07, "loss": 0.0005, "reward": 3.830557703971863, "reward_std": 0.0030419373651966453, "rewards/final_reward": 1.9596382233535177, "rewards/mask_iou_reward": 0.9798191116767588, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8305577635765076, "rewards/thk_ans_format_reward": 1.0, "step": 2149, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 169.140625, "epoch": 3.630691399662732, "grad_norm": 18.007528140348732, "kl": 0.513671875, "learning_rate": 2.748735244519393e-07, "loss": 0.0005, "reward": 3.420504570007324, "reward_std": 0.07008447870612144, "rewards/final_reward": 1.7162632921924779, "rewards/mask_iou_reward": 0.8581316460962389, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4205045104026794, "rewards/thk_ans_format_reward": 1.0, "step": 2150, "think_completion_length": 47.09375 }, { "clip_ratio": 0.0, "completion_length": 124.453125, "epoch": 3.632377740303541, "grad_norm": 6.8912677980775126, "kl": 0.57421875, "learning_rate": 2.7453625632377744e-07, "loss": 0.0006, "reward": 3.775565028190613, "reward_std": 0.05060883052647114, "rewards/final_reward": 1.847828195153058, "rewards/mask_iou_reward": 0.923914097576529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7755651473999023, "rewards/thk_ans_format_reward": 1.0, "step": 2151, "think_completion_length": 43.8125 }, { "clip_ratio": 0.0, "completion_length": 105.171875, "epoch": 3.6340640809443507, "grad_norm": 10.819998287532302, "kl": 0.5234375, "learning_rate": 2.7419898819561547e-07, "loss": 0.0005, "reward": 3.4836515188217163, "reward_std": 0.36593368649482727, "rewards/final_reward": 1.224108838536167, "rewards/mask_iou_reward": 0.6120544192680835, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4836515188217163, "rewards/thk_ans_format_reward": 1.0, "step": 2152, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 114.515625, "epoch": 3.6357504215851604, "grad_norm": 6.744144134909996, "kl": 0.53125, "learning_rate": 2.738617200674536e-07, "loss": 0.0005, "reward": 3.1626614332199097, "reward_std": 0.12014555651694536, "rewards/final_reward": 1.2144468510600666, "rewards/mask_iou_reward": 0.6072234255300333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.162661463022232, "rewards/thk_ans_format_reward": 1.0, "step": 2153, "think_completion_length": 40.6875 }, { "clip_ratio": 0.0, "completion_length": 116.03125, "epoch": 3.6374367622259696, "grad_norm": 26.616278710272272, "kl": 0.564453125, "learning_rate": 2.7352445193929175e-07, "loss": 0.0006, "reward": 2.7842090129852295, "reward_std": 0.33606940880417824, "rewards/final_reward": 0.5275331489239576, "rewards/mask_iou_reward": 0.2637665744619788, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7842088639736176, "rewards/thk_ans_format_reward": 1.0, "step": 2154, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 198.0, "epoch": 3.639123102866779, "grad_norm": 10.687676166681745, "kl": 0.474609375, "learning_rate": 2.7318718381112984e-07, "loss": 0.0005, "reward": 3.517017364501953, "reward_std": 0.2570660449564457, "rewards/final_reward": 1.442429098155051, "rewards/mask_iou_reward": 0.7212145490775255, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.5482673048973083, "rewards/thk_ans_format_reward": 0.984375, "step": 2155, "think_completion_length": 39.90625 }, { "clip_ratio": 0.0, "completion_length": 144.484375, "epoch": 3.6408094435075884, "grad_norm": 13.233145865190844, "kl": 0.587890625, "learning_rate": 2.7284991568296793e-07, "loss": 0.0006, "reward": 3.3401609659194946, "reward_std": 0.13841368909925222, "rewards/final_reward": 0.8899339333062407, "rewards/mask_iou_reward": 0.44496696665312035, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3714109063148499, "rewards/thk_ans_format_reward": 0.984375, "step": 2156, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 113.9375, "epoch": 3.642495784148398, "grad_norm": 6.341052709486836, "kl": 0.61328125, "learning_rate": 2.7251264755480607e-07, "loss": 0.0007, "reward": 2.9961708784103394, "reward_std": 0.06950226402841508, "rewards/final_reward": 1.7549071371048877, "rewards/mask_iou_reward": 0.8774535685524438, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9961709380149841, "rewards/thk_ans_format_reward": 1.0, "step": 2157, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 126.359375, "epoch": 3.6441821247892072, "grad_norm": 9.988799852208986, "kl": 0.54296875, "learning_rate": 2.7217537942664415e-07, "loss": 0.0005, "reward": 3.432819962501526, "reward_std": 0.12654725369066, "rewards/final_reward": 1.3642316455310084, "rewards/mask_iou_reward": 0.6821158227655042, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432819902896881, "rewards/thk_ans_format_reward": 1.0, "step": 2158, "think_completion_length": 44.53125 }, { "clip_ratio": 0.0, "completion_length": 111.859375, "epoch": 3.645868465430017, "grad_norm": 9.507871004604434, "kl": 0.572265625, "learning_rate": 2.718381112984823e-07, "loss": 0.0006, "reward": 3.5598703622817993, "reward_std": 0.06046362966299057, "rewards/final_reward": 1.534753833970691, "rewards/mask_iou_reward": 0.7673769169853455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5598702430725098, "rewards/thk_ans_format_reward": 1.0, "step": 2159, "think_completion_length": 41.4375 }, { "clip_ratio": 0.0, "completion_length": 115.15625, "epoch": 3.6475548060708265, "grad_norm": 16.92326930924938, "kl": 0.556640625, "learning_rate": 2.715008431703204e-07, "loss": 0.0006, "reward": 3.11311411857605, "reward_std": 0.11794350296258926, "rewards/final_reward": 1.8428080052810945, "rewards/mask_iou_reward": 0.9214040026405472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1131139993667603, "rewards/thk_ans_format_reward": 1.0, "step": 2160, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 3.6492411467116357, "grad_norm": 107.29921425149882, "kl": 0.53515625, "learning_rate": 2.7116357504215847e-07, "loss": 0.0005, "reward": 3.799378275871277, "reward_std": 0.042371081188321114, "rewards/final_reward": 1.7386770075593936, "rewards/mask_iou_reward": 0.8693385037796968, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7993780970573425, "rewards/thk_ans_format_reward": 1.0, "step": 2161, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 119.140625, "epoch": 3.6509274873524453, "grad_norm": 5.846962924465535, "kl": 0.564453125, "learning_rate": 2.708263069139966e-07, "loss": 0.0005, "reward": 3.193526268005371, "reward_std": 0.2762054353952408, "rewards/final_reward": 1.193118101721124, "rewards/mask_iou_reward": 0.596559050860562, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2091514766216278, "rewards/thk_ans_format_reward": 1.0, "step": 2162, "think_completion_length": 44.5625 }, { "clip_ratio": 0.0, "completion_length": 116.25, "epoch": 3.6526138279932545, "grad_norm": 8.83884476539582, "kl": 0.54296875, "learning_rate": 2.7048903878583475e-07, "loss": 0.0005, "reward": 3.3878525495529175, "reward_std": 0.04130503349006176, "rewards/final_reward": 1.0849541941114242, "rewards/mask_iou_reward": 0.5424770970557121, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3878525495529175, "rewards/thk_ans_format_reward": 1.0, "step": 2163, "think_completion_length": 43.25 }, { "clip_ratio": 0.0, "completion_length": 133.65625, "epoch": 3.654300168634064, "grad_norm": 7.364941555058796, "kl": 0.62109375, "learning_rate": 2.7015177065767284e-07, "loss": 0.0006, "reward": 3.138849139213562, "reward_std": 0.16283194720745087, "rewards/final_reward": 1.3491245156995535, "rewards/mask_iou_reward": 0.6745622578497767, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1388492584228516, "rewards/thk_ans_format_reward": 1.0, "step": 2164, "think_completion_length": 42.25 }, { "clip_ratio": 0.0, "completion_length": 114.296875, "epoch": 3.6559865092748733, "grad_norm": 7.3929923391392975, "kl": 0.515625, "learning_rate": 2.698145025295109e-07, "loss": 0.0005, "reward": 3.4643598794937134, "reward_std": 0.03516603959724307, "rewards/final_reward": 1.3184992668719937, "rewards/mask_iou_reward": 0.6592496334359969, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4643598198890686, "rewards/thk_ans_format_reward": 1.0, "step": 2165, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 114.234375, "epoch": 3.657672849915683, "grad_norm": 9.281403489514085, "kl": 0.5859375, "learning_rate": 2.6947723440134907e-07, "loss": 0.0006, "reward": 3.5559327602386475, "reward_std": 0.0707951420918107, "rewards/final_reward": 1.3471013670947625, "rewards/mask_iou_reward": 0.6735506835473812, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5559325814247131, "rewards/thk_ans_format_reward": 1.0, "step": 2166, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 114.125, "epoch": 3.6593591905564926, "grad_norm": 9.020458741884312, "kl": 0.58203125, "learning_rate": 2.691399662731872e-07, "loss": 0.0006, "reward": 3.6217806339263916, "reward_std": 0.22465556859970093, "rewards/final_reward": 1.7130859668021712, "rewards/mask_iou_reward": 0.8565429834010856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6217804551124573, "rewards/thk_ans_format_reward": 1.0, "step": 2167, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 114.15625, "epoch": 3.661045531197302, "grad_norm": 33.60697679015086, "kl": 0.703125, "learning_rate": 2.6880269814502524e-07, "loss": 0.0007, "reward": 3.453278422355652, "reward_std": 0.10349838621914387, "rewards/final_reward": 1.295618368161848, "rewards/mask_iou_reward": 0.647809184080924, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4532784223556519, "rewards/thk_ans_format_reward": 1.0, "step": 2168, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 111.484375, "epoch": 3.6627318718381114, "grad_norm": 20.92072593765282, "kl": 0.837890625, "learning_rate": 2.684654300168634e-07, "loss": 0.0008, "reward": 3.924134373664856, "reward_std": 0.006402852479368448, "rewards/final_reward": 1.9268632408703417, "rewards/mask_iou_reward": 0.9634316204351708, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.924134373664856, "rewards/thk_ans_format_reward": 1.0, "step": 2169, "think_completion_length": 40.53125 }, { "clip_ratio": 0.0, "completion_length": 113.078125, "epoch": 3.6644182124789206, "grad_norm": 10.85155357900207, "kl": 0.583984375, "learning_rate": 2.681281618887015e-07, "loss": 0.0006, "reward": 3.402068257331848, "reward_std": 0.025438982993364334, "rewards/final_reward": 1.6983160717437744, "rewards/mask_iou_reward": 0.8491580358718872, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4020682573318481, "rewards/thk_ans_format_reward": 1.0, "step": 2170, "think_completion_length": 43.6875 }, { "clip_ratio": 0.0, "completion_length": 123.453125, "epoch": 3.6661045531197303, "grad_norm": 6.932460295196636, "kl": 0.6640625, "learning_rate": 2.677908937605396e-07, "loss": 0.0007, "reward": 3.6236302852630615, "reward_std": 0.2714125607162714, "rewards/final_reward": 1.7773782817528996, "rewards/mask_iou_reward": 0.8886891408764498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6236302852630615, "rewards/thk_ans_format_reward": 1.0, "step": 2171, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 117.15625, "epoch": 3.6677908937605395, "grad_norm": 8.724160078814055, "kl": 0.68359375, "learning_rate": 2.674536256323777e-07, "loss": 0.0007, "reward": 3.245076298713684, "reward_std": 0.17124823480844498, "rewards/final_reward": 1.6967053509231722, "rewards/mask_iou_reward": 0.8483526754615861, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.245076298713684, "rewards/thk_ans_format_reward": 1.0, "step": 2172, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 118.140625, "epoch": 3.669477234401349, "grad_norm": 10.374725265579276, "kl": 0.6015625, "learning_rate": 2.6711635750421584e-07, "loss": 0.0006, "reward": 3.354717493057251, "reward_std": 0.0516207218170166, "rewards/final_reward": 1.6412412625450845, "rewards/mask_iou_reward": 0.8206206312725423, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.354717493057251, "rewards/thk_ans_format_reward": 1.0, "step": 2173, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 115.359375, "epoch": 3.6711635750421587, "grad_norm": 10.050664184936608, "kl": 0.5703125, "learning_rate": 2.66779089376054e-07, "loss": 0.0006, "reward": 3.3863418102264404, "reward_std": 0.05082100164145231, "rewards/final_reward": 1.3180649394657395, "rewards/mask_iou_reward": 0.6590324697328698, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3863418102264404, "rewards/thk_ans_format_reward": 1.0, "step": 2174, "think_completion_length": 45.65625 }, { "clip_ratio": 0.0, "completion_length": 152.484375, "epoch": 3.672849915682968, "grad_norm": 12.769801192651933, "kl": 0.50390625, "learning_rate": 2.6644182124789206e-07, "loss": 0.0005, "reward": 3.61961829662323, "reward_std": 0.2114700749516487, "rewards/final_reward": 1.8251791289417982, "rewards/mask_iou_reward": 0.9125895644708991, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6196181774139404, "rewards/thk_ans_format_reward": 1.0, "step": 2175, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 133.203125, "epoch": 3.6745362563237776, "grad_norm": 17.69803214425245, "kl": 0.56640625, "learning_rate": 2.661045531197302e-07, "loss": 0.0006, "reward": 3.3327629566192627, "reward_std": 0.2733706757426262, "rewards/final_reward": 1.1473787205170702, "rewards/mask_iou_reward": 0.5736893602585351, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3327630162239075, "rewards/thk_ans_format_reward": 1.0, "step": 2176, "think_completion_length": 40.15625 }, { "clip_ratio": 0.0, "completion_length": 115.328125, "epoch": 3.6762225969645868, "grad_norm": 9.607761168239486, "kl": 0.55078125, "learning_rate": 2.657672849915683e-07, "loss": 0.0006, "reward": 3.2653400897979736, "reward_std": 0.3390034884214401, "rewards/final_reward": 0.8602963584102136, "rewards/mask_iou_reward": 0.4301481792051068, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2653402090072632, "rewards/thk_ans_format_reward": 1.0, "step": 2177, "think_completion_length": 44.59375 }, { "clip_ratio": 0.0, "completion_length": 169.09375, "epoch": 3.6779089376053964, "grad_norm": 7.861388433142454, "kl": 0.470703125, "learning_rate": 2.654300168634064e-07, "loss": 0.0005, "reward": 3.666589379310608, "reward_std": 0.0676095001399517, "rewards/final_reward": 1.5565185785082936, "rewards/mask_iou_reward": 0.7782592892541468, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6665893197059631, "rewards/thk_ans_format_reward": 1.0, "step": 2178, "think_completion_length": 44.46875 }, { "clip_ratio": 0.0, "completion_length": 114.765625, "epoch": 3.6795952782462056, "grad_norm": 6.506833566530017, "kl": 0.5546875, "learning_rate": 2.650927487352445e-07, "loss": 0.0006, "reward": 3.3174102306365967, "reward_std": 0.09125644341111183, "rewards/final_reward": 1.274266532450679, "rewards/mask_iou_reward": 0.6371332662253395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3174102306365967, "rewards/thk_ans_format_reward": 1.0, "step": 2179, "think_completion_length": 41.46875 }, { "clip_ratio": 0.0, "completion_length": 113.859375, "epoch": 3.681281618887015, "grad_norm": 16.964483085595624, "kl": 0.56640625, "learning_rate": 2.6475548060708266e-07, "loss": 0.0006, "reward": 3.723142385482788, "reward_std": 0.08161863312125206, "rewards/final_reward": 1.7507960958907227, "rewards/mask_iou_reward": 0.8753980479453614, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7231422066688538, "rewards/thk_ans_format_reward": 1.0, "step": 2180, "think_completion_length": 45.34375 }, { "clip_ratio": 0.0, "completion_length": 113.96875, "epoch": 3.682967959527825, "grad_norm": 8.963115232354015, "kl": 0.5625, "learning_rate": 2.644182124789207e-07, "loss": 0.0006, "reward": 2.962601661682129, "reward_std": 0.1202041245996952, "rewards/final_reward": 1.2414144244462615, "rewards/mask_iou_reward": 0.6207072122231307, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9626017510890961, "rewards/thk_ans_format_reward": 1.0, "step": 2181, "think_completion_length": 44.6875 }, { "clip_ratio": 0.0, "completion_length": 109.953125, "epoch": 3.684654300168634, "grad_norm": 7.8568867421636375, "kl": 0.5703125, "learning_rate": 2.6408094435075883e-07, "loss": 0.0006, "reward": 3.705932855606079, "reward_std": 0.15980882477015257, "rewards/final_reward": 1.6566305434563295, "rewards/mask_iou_reward": 0.8283152717281648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7059327960014343, "rewards/thk_ans_format_reward": 1.0, "step": 2182, "think_completion_length": 38.125 }, { "clip_ratio": 0.0, "completion_length": 139.078125, "epoch": 3.6863406408094432, "grad_norm": 26.061102769842314, "kl": 0.55078125, "learning_rate": 2.63743676222597e-07, "loss": 0.0005, "reward": 3.373674750328064, "reward_std": 0.23157277703285217, "rewards/final_reward": 1.3468981540470226, "rewards/mask_iou_reward": 0.6734490770235113, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.373674750328064, "rewards/thk_ans_format_reward": 1.0, "step": 2183, "think_completion_length": 46.90625 }, { "clip_ratio": 0.0, "completion_length": 135.046875, "epoch": 3.688026981450253, "grad_norm": 6.378374390223215, "kl": 0.56640625, "learning_rate": 2.6340640809443506e-07, "loss": 0.0006, "reward": 3.674358606338501, "reward_std": 0.19893009960651398, "rewards/final_reward": 1.5619072067565283, "rewards/mask_iou_reward": 0.7809536033782641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.674358606338501, "rewards/thk_ans_format_reward": 1.0, "step": 2184, "think_completion_length": 41.625 }, { "clip_ratio": 0.0, "completion_length": 113.375, "epoch": 3.6897133220910625, "grad_norm": 5.631948058301189, "kl": 0.5625, "learning_rate": 2.6306913996627315e-07, "loss": 0.0006, "reward": 2.777292251586914, "reward_std": 0.03504425939172506, "rewards/final_reward": 0.5667150524643376, "rewards/mask_iou_reward": 0.2833575262321688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7772921919822693, "rewards/thk_ans_format_reward": 1.0, "step": 2185, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 107.890625, "epoch": 3.6913996627318717, "grad_norm": 6.357830145895411, "kl": 0.560546875, "learning_rate": 2.627318718381113e-07, "loss": 0.0006, "reward": 3.7768133878707886, "reward_std": 0.148924196138978, "rewards/final_reward": 1.7280991344863805, "rewards/mask_iou_reward": 0.8640495672431903, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7768135070800781, "rewards/thk_ans_format_reward": 1.0, "step": 2186, "think_completion_length": 35.75 }, { "clip_ratio": 0.0, "completion_length": 112.046875, "epoch": 3.6930860033726813, "grad_norm": 17.68862850386764, "kl": 0.54296875, "learning_rate": 2.6239460370994943e-07, "loss": 0.0006, "reward": 3.8915481567382812, "reward_std": 0.004744681587908417, "rewards/final_reward": 1.9137755741846734, "rewards/mask_iou_reward": 0.9568877870923367, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.891547977924347, "rewards/thk_ans_format_reward": 1.0, "step": 2187, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 142.5625, "epoch": 3.694772344013491, "grad_norm": 6.4891286437355395, "kl": 0.537109375, "learning_rate": 2.620573355817875e-07, "loss": 0.0005, "reward": 3.456454873085022, "reward_std": 0.15797370299696922, "rewards/final_reward": 1.5738219841745664, "rewards/mask_iou_reward": 0.7869109920872832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4564548134803772, "rewards/thk_ans_format_reward": 1.0, "step": 2188, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 113.28125, "epoch": 3.6964586846543, "grad_norm": 7.514792783715896, "kl": 0.5205078125, "learning_rate": 2.617200674536256e-07, "loss": 0.0005, "reward": 3.180192708969116, "reward_std": 0.034278427017852664, "rewards/final_reward": 0.8698045692364691, "rewards/mask_iou_reward": 0.43490228461823455, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1801927089691162, "rewards/thk_ans_format_reward": 1.0, "step": 2189, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 108.21875, "epoch": 3.6981450252951094, "grad_norm": 6.422281694154933, "kl": 0.56640625, "learning_rate": 2.6138279932546375e-07, "loss": 0.0006, "reward": 3.884060502052307, "reward_std": 0.0026781876804307103, "rewards/final_reward": 1.8500053453206569, "rewards/mask_iou_reward": 0.9250026726603284, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8840603828430176, "rewards/thk_ans_format_reward": 1.0, "step": 2190, "think_completion_length": 40.9375 }, { "clip_ratio": 0.0, "completion_length": 114.625, "epoch": 3.699831365935919, "grad_norm": 9.07015849728561, "kl": 0.59375, "learning_rate": 2.6104553119730183e-07, "loss": 0.0006, "reward": 3.3919734954833984, "reward_std": 0.049376328475773335, "rewards/final_reward": 1.2875744224146906, "rewards/mask_iou_reward": 0.6437872112073453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.391973614692688, "rewards/thk_ans_format_reward": 1.0, "step": 2191, "think_completion_length": 43.40625 }, { "clip_ratio": 0.0, "completion_length": 113.125, "epoch": 3.7015177065767286, "grad_norm": 36.21098255396208, "kl": 0.57421875, "learning_rate": 2.6070826306913997e-07, "loss": 0.0006, "reward": 3.5568835735321045, "reward_std": 0.08409557677805424, "rewards/final_reward": 1.5891457329541123, "rewards/mask_iou_reward": 0.7945728664770562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.556883454322815, "rewards/thk_ans_format_reward": 1.0, "step": 2192, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 109.671875, "epoch": 3.703204047217538, "grad_norm": 6.378184318212264, "kl": 0.54296875, "learning_rate": 2.6037099494097806e-07, "loss": 0.0005, "reward": 3.353445529937744, "reward_std": 0.004928447189740837, "rewards/final_reward": 1.7434356803338902, "rewards/mask_iou_reward": 0.8717178401669451, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3534456193447113, "rewards/thk_ans_format_reward": 1.0, "step": 2193, "think_completion_length": 39.78125 }, { "clip_ratio": 0.0, "completion_length": 148.046875, "epoch": 3.7048903878583475, "grad_norm": 31.98309447729436, "kl": 0.59765625, "learning_rate": 2.6003372681281615e-07, "loss": 0.0006, "reward": 3.554110288619995, "reward_std": 0.0832400880753994, "rewards/final_reward": 1.1983655867542573, "rewards/mask_iou_reward": 0.5991827933771287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5541102886199951, "rewards/thk_ans_format_reward": 1.0, "step": 2194, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 108.96875, "epoch": 3.706576728499157, "grad_norm": 16.719532672014108, "kl": 0.591796875, "learning_rate": 2.596964586846543e-07, "loss": 0.0005, "reward": 3.358941078186035, "reward_std": 0.03364470507949591, "rewards/final_reward": 1.0305207266722358, "rewards/mask_iou_reward": 0.5152603633361179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.35894113779068, "rewards/thk_ans_format_reward": 1.0, "step": 2195, "think_completion_length": 36.59375 }, { "clip_ratio": 0.0, "completion_length": 176.15625, "epoch": 3.7082630691399663, "grad_norm": 9.148044836164983, "kl": 0.494140625, "learning_rate": 2.5935919055649243e-07, "loss": 0.0005, "reward": 3.2433149814605713, "reward_std": 0.3317317571491003, "rewards/final_reward": 1.6999731855188098, "rewards/mask_iou_reward": 0.8499865927594049, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2433151006698608, "rewards/thk_ans_format_reward": 1.0, "step": 2196, "think_completion_length": 39.90625 }, { "clip_ratio": 0.0, "completion_length": 110.71875, "epoch": 3.7099494097807755, "grad_norm": 11.166676327464593, "kl": 0.564453125, "learning_rate": 2.5902192242833046e-07, "loss": 0.0006, "reward": 3.205594062805176, "reward_std": 0.14426006376743317, "rewards/final_reward": 1.4832082424943116, "rewards/mask_iou_reward": 0.7416041212471558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.205594152212143, "rewards/thk_ans_format_reward": 1.0, "step": 2197, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 109.875, "epoch": 3.711635750421585, "grad_norm": 17.63438941915533, "kl": 0.619140625, "learning_rate": 2.586846543001686e-07, "loss": 0.0006, "reward": 3.4435391426086426, "reward_std": 0.04925878718495369, "rewards/final_reward": 1.4857788899066038, "rewards/mask_iou_reward": 0.7428894449533019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4435392022132874, "rewards/thk_ans_format_reward": 1.0, "step": 2198, "think_completion_length": 39.46875 }, { "clip_ratio": 0.0, "completion_length": 109.265625, "epoch": 3.7133220910623947, "grad_norm": 22.065547912115726, "kl": 0.53515625, "learning_rate": 2.5834738617200674e-07, "loss": 0.0005, "reward": 3.2958098649978638, "reward_std": 0.0769207589328289, "rewards/final_reward": 1.3118947208677025, "rewards/mask_iou_reward": 0.6559473604338513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.295809805393219, "rewards/thk_ans_format_reward": 1.0, "step": 2199, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 111.0625, "epoch": 3.715008431703204, "grad_norm": 9.77059697873224, "kl": 0.6171875, "learning_rate": 2.580101180438449e-07, "loss": 0.0006, "reward": 3.5691983699798584, "reward_std": 0.17080958746373653, "rewards/final_reward": 1.6634956217435797, "rewards/mask_iou_reward": 0.8317478108717898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5691982507705688, "rewards/thk_ans_format_reward": 1.0, "step": 2200, "think_completion_length": 40.0 }, { "clip_ratio": 0.0, "completion_length": 97.078125, "epoch": 3.7166947723440136, "grad_norm": 6.208268504324163, "kl": 0.5556640625, "learning_rate": 2.576728499156829e-07, "loss": 0.0006, "reward": 3.8266072273254395, "reward_std": 0.15711436793208122, "rewards/final_reward": 1.767159090919887, "rewards/mask_iou_reward": 0.8835795454599435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8266071677207947, "rewards/thk_ans_format_reward": 1.0, "step": 2201, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 107.171875, "epoch": 3.718381112984823, "grad_norm": 8.694354037283233, "kl": 0.662109375, "learning_rate": 2.5733558178752106e-07, "loss": 0.0007, "reward": 3.179361581802368, "reward_std": 0.030320387333631516, "rewards/final_reward": 1.3028335589093758, "rewards/mask_iou_reward": 0.6514167794546879, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.17936173081398, "rewards/thk_ans_format_reward": 1.0, "step": 2202, "think_completion_length": 36.9375 }, { "clip_ratio": 0.0, "completion_length": 112.0625, "epoch": 3.7200674536256324, "grad_norm": 12.425460690779902, "kl": 0.49609375, "learning_rate": 2.569983136593592e-07, "loss": 0.0005, "reward": 3.647360920906067, "reward_std": 0.06752203544601798, "rewards/final_reward": 1.86081172540908, "rewards/mask_iou_reward": 0.93040586270454, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6473609805107117, "rewards/thk_ans_format_reward": 1.0, "step": 2203, "think_completion_length": 38.9375 }, { "clip_ratio": 0.0, "completion_length": 113.796875, "epoch": 3.7217537942664416, "grad_norm": 20.256036715233673, "kl": 0.5859375, "learning_rate": 2.566610455311973e-07, "loss": 0.0006, "reward": 3.146657943725586, "reward_std": 0.23980345856398344, "rewards/final_reward": 1.6253581227571177, "rewards/mask_iou_reward": 0.8126790613785588, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.146657943725586, "rewards/thk_ans_format_reward": 1.0, "step": 2204, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 108.796875, "epoch": 3.7234401349072512, "grad_norm": 8.886069594999654, "kl": 0.541015625, "learning_rate": 2.5632377740303543e-07, "loss": 0.0006, "reward": 3.452314257621765, "reward_std": 0.07864137506112456, "rewards/final_reward": 1.2201046790815127, "rewards/mask_iou_reward": 0.6100523395407563, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4523141980171204, "rewards/thk_ans_format_reward": 1.0, "step": 2205, "think_completion_length": 37.625 }, { "clip_ratio": 0.0, "completion_length": 139.046875, "epoch": 3.725126475548061, "grad_norm": 11.426640569808292, "kl": 0.548828125, "learning_rate": 2.559865092748735e-07, "loss": 0.0005, "reward": 3.317999005317688, "reward_std": 0.25152764841914177, "rewards/final_reward": 1.0653466622048742, "rewards/mask_iou_reward": 0.5326733311024371, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3179990351200104, "rewards/thk_ans_format_reward": 1.0, "step": 2206, "think_completion_length": 39.96875 }, { "clip_ratio": 0.0, "completion_length": 175.09375, "epoch": 3.72681281618887, "grad_norm": 8.227125714328034, "kl": 0.490234375, "learning_rate": 2.556492411467116e-07, "loss": 0.0005, "reward": 3.541482925415039, "reward_std": 0.23475152254104614, "rewards/final_reward": 1.9007196628561678, "rewards/mask_iou_reward": 0.9503598314280839, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5414828062057495, "rewards/thk_ans_format_reward": 1.0, "step": 2207, "think_completion_length": 38.9375 }, { "clip_ratio": 0.0, "completion_length": 109.53125, "epoch": 3.7284991568296797, "grad_norm": 8.56115780454844, "kl": 0.595703125, "learning_rate": 2.5531197301854974e-07, "loss": 0.0006, "reward": 2.894770622253418, "reward_std": 0.37094295769929886, "rewards/final_reward": 0.772514029249296, "rewards/mask_iou_reward": 0.386257014624648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8947706520557404, "rewards/thk_ans_format_reward": 1.0, "step": 2208, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 123.625, "epoch": 3.730185497470489, "grad_norm": 9.777623691175139, "kl": 0.537109375, "learning_rate": 2.549747048903879e-07, "loss": 0.0005, "reward": 3.601067543029785, "reward_std": 0.06892485357820988, "rewards/final_reward": 1.581348549336064, "rewards/mask_iou_reward": 0.790674274668032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6010676622390747, "rewards/thk_ans_format_reward": 1.0, "step": 2209, "think_completion_length": 36.5 }, { "clip_ratio": 0.0, "completion_length": 111.734375, "epoch": 3.7318718381112985, "grad_norm": 12.77242033038978, "kl": 0.568359375, "learning_rate": 2.546374367622259e-07, "loss": 0.0006, "reward": 3.286626100540161, "reward_std": 0.1027615237981081, "rewards/final_reward": 1.4901105977761426, "rewards/mask_iou_reward": 0.7450552988880713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2866259813308716, "rewards/thk_ans_format_reward": 1.0, "step": 2210, "think_completion_length": 41.875 }, { "clip_ratio": 0.0, "completion_length": 131.71875, "epoch": 3.7335581787521077, "grad_norm": 12.011241878322691, "kl": 0.525390625, "learning_rate": 2.5430016863406406e-07, "loss": 0.0006, "reward": 3.164780020713806, "reward_std": 0.05755174346268177, "rewards/final_reward": 0.8981543311624601, "rewards/mask_iou_reward": 0.44907716558123006, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1647798418998718, "rewards/thk_ans_format_reward": 1.0, "step": 2211, "think_completion_length": 39.53125 }, { "clip_ratio": 0.0, "completion_length": 105.03125, "epoch": 3.7352445193929174, "grad_norm": 8.407289252559087, "kl": 0.576171875, "learning_rate": 2.539629005059022e-07, "loss": 0.0006, "reward": 3.6150245666503906, "reward_std": 0.27539839781820774, "rewards/final_reward": 1.5987298278611233, "rewards/mask_iou_reward": 0.7993649139305616, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6150245666503906, "rewards/thk_ans_format_reward": 1.0, "step": 2212, "think_completion_length": 43.75 }, { "clip_ratio": 0.0, "completion_length": 112.71875, "epoch": 3.736930860033727, "grad_norm": 4.722739695611589, "kl": 0.556640625, "learning_rate": 2.5362563237774034e-07, "loss": 0.0007, "reward": 3.00023877620697, "reward_std": 0.08326515275985003, "rewards/final_reward": 0.3297335486369651, "rewards/mask_iou_reward": 0.16486677431848254, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0002387762069702, "rewards/thk_ans_format_reward": 1.0, "step": 2213, "think_completion_length": 39.40625 }, { "clip_ratio": 0.0, "completion_length": 116.078125, "epoch": 3.738617200674536, "grad_norm": 12.344920387140817, "kl": 0.6171875, "learning_rate": 2.5328836424957837e-07, "loss": 0.0006, "reward": 2.7376948595046997, "reward_std": 0.1573820672929287, "rewards/final_reward": 0.7704514279846209, "rewards/mask_iou_reward": 0.3852257139923104, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7376948893070221, "rewards/thk_ans_format_reward": 1.0, "step": 2214, "think_completion_length": 46.625 }, { "clip_ratio": 0.0, "completion_length": 106.953125, "epoch": 3.740303541315346, "grad_norm": 8.864111740080398, "kl": 0.611328125, "learning_rate": 2.529510961214165e-07, "loss": 0.0006, "reward": 3.425600290298462, "reward_std": 0.09412947855889797, "rewards/final_reward": 1.4392662550548065, "rewards/mask_iou_reward": 0.7196331275274033, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4256003499031067, "rewards/thk_ans_format_reward": 1.0, "step": 2215, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 109.109375, "epoch": 3.741989881956155, "grad_norm": 9.86841034282967, "kl": 0.640625, "learning_rate": 2.5261382799325465e-07, "loss": 0.0006, "reward": 3.367143392562866, "reward_std": 0.13670575991272926, "rewards/final_reward": 1.6991905185967289, "rewards/mask_iou_reward": 0.8495952592983644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3671433925628662, "rewards/thk_ans_format_reward": 1.0, "step": 2216, "think_completion_length": 39.6875 }, { "clip_ratio": 0.0, "completion_length": 111.9375, "epoch": 3.7436762225969646, "grad_norm": 7.511324151445277, "kl": 0.6416015625, "learning_rate": 2.5227655986509274e-07, "loss": 0.0006, "reward": 3.638978123664856, "reward_std": 0.25424132496118546, "rewards/final_reward": 1.3872458365062224, "rewards/mask_iou_reward": 0.6936229182531112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6389780044555664, "rewards/thk_ans_format_reward": 1.0, "step": 2217, "think_completion_length": 42.90625 }, { "clip_ratio": 0.0, "completion_length": 115.984375, "epoch": 3.745362563237774, "grad_norm": 8.780601057458544, "kl": 0.533203125, "learning_rate": 2.5193929173693083e-07, "loss": 0.0005, "reward": 3.2593986988067627, "reward_std": 0.07271349988877773, "rewards/final_reward": 1.7570459376830176, "rewards/mask_iou_reward": 0.8785229688415088, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2593986988067627, "rewards/thk_ans_format_reward": 1.0, "step": 2218, "think_completion_length": 44.25 }, { "clip_ratio": 0.0, "completion_length": 115.078125, "epoch": 3.7470489038785835, "grad_norm": 7.846688741217049, "kl": 0.501953125, "learning_rate": 2.5160202360876897e-07, "loss": 0.0005, "reward": 3.4012067317962646, "reward_std": 0.22127216309309006, "rewards/final_reward": 1.4312873112016784, "rewards/mask_iou_reward": 0.7156436556008392, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4012067914009094, "rewards/thk_ans_format_reward": 1.0, "step": 2219, "think_completion_length": 36.53125 }, { "clip_ratio": 0.0, "completion_length": 111.0625, "epoch": 3.748735244519393, "grad_norm": 7.298265056389149, "kl": 0.55078125, "learning_rate": 2.5126475548060706e-07, "loss": 0.0006, "reward": 3.504394292831421, "reward_std": 0.18095969408750534, "rewards/final_reward": 1.443156090383158, "rewards/mask_iou_reward": 0.721578045191579, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5043942332267761, "rewards/thk_ans_format_reward": 1.0, "step": 2220, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 154.484375, "epoch": 3.7504215851602023, "grad_norm": 6.222948569178547, "kl": 0.46875, "learning_rate": 2.509274873524452e-07, "loss": 0.0005, "reward": 3.4797459840774536, "reward_std": 0.07105998322367668, "rewards/final_reward": 1.6921433070142375, "rewards/mask_iou_reward": 0.8460716535071188, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.479745864868164, "rewards/thk_ans_format_reward": 1.0, "step": 2221, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 195.125, "epoch": 3.752107925801012, "grad_norm": 9.67562564815868, "kl": 0.4443359375, "learning_rate": 2.505902192242833e-07, "loss": 0.0005, "reward": 3.7335736751556396, "reward_std": 0.1016400195658207, "rewards/final_reward": 1.7593110192348576, "rewards/mask_iou_reward": 0.8796555096174288, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7335737347602844, "rewards/thk_ans_format_reward": 1.0, "step": 2222, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 147.40625, "epoch": 3.753794266441821, "grad_norm": 28.102773270256623, "kl": 0.517578125, "learning_rate": 2.5025295109612137e-07, "loss": 0.0005, "reward": 3.398337244987488, "reward_std": 0.06284121796488762, "rewards/final_reward": 1.1384011844333064, "rewards/mask_iou_reward": 0.5692005922166532, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.398337185382843, "rewards/thk_ans_format_reward": 1.0, "step": 2223, "think_completion_length": 41.15625 }, { "clip_ratio": 0.0, "completion_length": 107.734375, "epoch": 3.7554806070826308, "grad_norm": 8.641382424478827, "kl": 0.5625, "learning_rate": 2.499156829679595e-07, "loss": 0.0006, "reward": 3.4698305130004883, "reward_std": 0.09916340420022607, "rewards/final_reward": 1.4339345953372138, "rewards/mask_iou_reward": 0.7169672976686069, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4698305130004883, "rewards/thk_ans_format_reward": 1.0, "step": 2224, "think_completion_length": 36.90625 }, { "clip_ratio": 0.0, "completion_length": 147.28125, "epoch": 3.75716694772344, "grad_norm": 11.69530197812389, "kl": 0.490234375, "learning_rate": 2.4957841483979765e-07, "loss": 0.0005, "reward": 3.0470253229141235, "reward_std": 0.13745611906051636, "rewards/final_reward": 1.1631897760026755, "rewards/mask_iou_reward": 0.5815948880013377, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0470252931118011, "rewards/thk_ans_format_reward": 1.0, "step": 2225, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 3.7588532883642496, "grad_norm": 7.5304097896928415, "kl": 0.66796875, "learning_rate": 2.4924114671163574e-07, "loss": 0.0007, "reward": 3.7693170309066772, "reward_std": 0.23819169402122498, "rewards/final_reward": 1.684009822965522, "rewards/mask_iou_reward": 0.842004911482761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7693171501159668, "rewards/thk_ans_format_reward": 1.0, "step": 2226, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 140.84375, "epoch": 3.7605396290050592, "grad_norm": 22.00318643891571, "kl": 0.484375, "learning_rate": 2.489038785834739e-07, "loss": 0.0004, "reward": 3.6224844455718994, "reward_std": 0.16775443218648434, "rewards/final_reward": 1.5701185300553635, "rewards/mask_iou_reward": 0.7850592650276818, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6224846243858337, "rewards/thk_ans_format_reward": 1.0, "step": 2227, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 108.21875, "epoch": 3.7622259696458684, "grad_norm": 7.513678492251124, "kl": 0.59375, "learning_rate": 2.4856661045531197e-07, "loss": 0.0006, "reward": 3.82713782787323, "reward_std": 0.016284896060824394, "rewards/final_reward": 1.7292593459223653, "rewards/mask_iou_reward": 0.8646296729611826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8271378874778748, "rewards/thk_ans_format_reward": 1.0, "step": 2228, "think_completion_length": 38.0625 }, { "clip_ratio": 0.0, "completion_length": 117.671875, "epoch": 3.763912310286678, "grad_norm": 9.628506712225574, "kl": 0.54296875, "learning_rate": 2.4822934232715005e-07, "loss": 0.0005, "reward": 3.134037733078003, "reward_std": 0.08405065536499023, "rewards/final_reward": 1.1449944024136807, "rewards/mask_iou_reward": 0.5724972012068403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1340378522872925, "rewards/thk_ans_format_reward": 1.0, "step": 2229, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 116.9375, "epoch": 3.7655986509274872, "grad_norm": 6.483859533323332, "kl": 0.544921875, "learning_rate": 2.478920741989882e-07, "loss": 0.0005, "reward": 2.9359426498413086, "reward_std": 0.03952119592577219, "rewards/final_reward": 1.0045149243772502, "rewards/mask_iou_reward": 0.5022574621886251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.935942679643631, "rewards/thk_ans_format_reward": 1.0, "step": 2230, "think_completion_length": 47.1875 }, { "clip_ratio": 0.0, "completion_length": 123.0, "epoch": 3.767284991568297, "grad_norm": 6.321503986653633, "kl": 0.53125, "learning_rate": 2.475548060708263e-07, "loss": 0.0005, "reward": 3.5230026245117188, "reward_std": 0.07344697206281126, "rewards/final_reward": 1.4060531780160597, "rewards/mask_iou_reward": 0.7030265890080298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.523002803325653, "rewards/thk_ans_format_reward": 1.0, "step": 2231, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 161.390625, "epoch": 3.768971332209106, "grad_norm": 11.485002134115911, "kl": 0.5146484375, "learning_rate": 2.4721753794266437e-07, "loss": 0.0005, "reward": 3.668249011039734, "reward_std": 0.028451272868551314, "rewards/final_reward": 1.675905328253724, "rewards/mask_iou_reward": 0.837952664126862, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6682490706443787, "rewards/thk_ans_format_reward": 1.0, "step": 2232, "think_completion_length": 39.65625 }, { "clip_ratio": 0.0, "completion_length": 125.515625, "epoch": 3.7706576728499157, "grad_norm": 7.764965076833885, "kl": 0.556640625, "learning_rate": 2.468802698145025e-07, "loss": 0.0006, "reward": 3.082284688949585, "reward_std": 0.13143670186400414, "rewards/final_reward": 0.6998492752706811, "rewards/mask_iou_reward": 0.34992463763534054, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0822848081588745, "rewards/thk_ans_format_reward": 1.0, "step": 2233, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 121.1875, "epoch": 3.7723440134907253, "grad_norm": 6.845152442654978, "kl": 0.48828125, "learning_rate": 2.4654300168634065e-07, "loss": 0.0005, "reward": 3.307486414909363, "reward_std": 0.07110036723315716, "rewards/final_reward": 1.8586305476138003, "rewards/mask_iou_reward": 0.9293152738069002, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3074862957000732, "rewards/thk_ans_format_reward": 1.0, "step": 2234, "think_completion_length": 39.84375 }, { "clip_ratio": 0.0, "completion_length": 115.65625, "epoch": 3.7740303541315345, "grad_norm": 21.161976840363057, "kl": 0.546875, "learning_rate": 2.4620573355817874e-07, "loss": 0.0005, "reward": 3.489209532737732, "reward_std": 0.1913529559969902, "rewards/final_reward": 1.392710809261045, "rewards/mask_iou_reward": 0.6963554046305225, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4892095923423767, "rewards/thk_ans_format_reward": 1.0, "step": 2235, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 110.1875, "epoch": 3.775716694772344, "grad_norm": 41.94328118116562, "kl": 0.568359375, "learning_rate": 2.458684654300169e-07, "loss": 0.0006, "reward": 3.7272164821624756, "reward_std": 0.10080359177663922, "rewards/final_reward": 1.5988337732937237, "rewards/mask_iou_reward": 0.7994168866468618, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7272164821624756, "rewards/thk_ans_format_reward": 1.0, "step": 2236, "think_completion_length": 39.125 }, { "clip_ratio": 0.0, "completion_length": 113.703125, "epoch": 3.7774030354131534, "grad_norm": 12.693738709933934, "kl": 0.5234375, "learning_rate": 2.4553119730185496e-07, "loss": 0.0005, "reward": 3.2859922647476196, "reward_std": 0.3157341778278351, "rewards/final_reward": 1.5607425408666726, "rewards/mask_iou_reward": 0.7803712704333363, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3016172647476196, "rewards/thk_ans_format_reward": 1.0, "step": 2237, "think_completion_length": 38.71875 }, { "clip_ratio": 0.0, "completion_length": 130.984375, "epoch": 3.779089376053963, "grad_norm": 21.18898534386323, "kl": 0.55859375, "learning_rate": 2.451939291736931e-07, "loss": 0.0006, "reward": 3.6144726276397705, "reward_std": 0.05499284155666828, "rewards/final_reward": 1.5110642429760337, "rewards/mask_iou_reward": 0.7555321214880169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6144728064537048, "rewards/thk_ans_format_reward": 1.0, "step": 2238, "think_completion_length": 36.03125 }, { "clip_ratio": 0.0, "completion_length": 124.296875, "epoch": 3.780775716694772, "grad_norm": 11.254064369218629, "kl": 0.576171875, "learning_rate": 2.448566610455312e-07, "loss": 0.0006, "reward": 3.332844614982605, "reward_std": 0.22398744896054268, "rewards/final_reward": 1.2413680375181946, "rewards/mask_iou_reward": 0.6206840187590973, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.332844614982605, "rewards/thk_ans_format_reward": 1.0, "step": 2239, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 114.953125, "epoch": 3.782462057335582, "grad_norm": 12.948537225722228, "kl": 0.576171875, "learning_rate": 2.4451939291736933e-07, "loss": 0.0006, "reward": 3.6998353004455566, "reward_std": 0.02471212111413479, "rewards/final_reward": 1.7049763848571122, "rewards/mask_iou_reward": 0.8524881924285561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6998353004455566, "rewards/thk_ans_format_reward": 1.0, "step": 2240, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 120.390625, "epoch": 3.7841483979763915, "grad_norm": 10.909507483133451, "kl": 0.4892578125, "learning_rate": 2.441821247892074e-07, "loss": 0.0005, "reward": 3.4952789545059204, "reward_std": 0.03494591638445854, "rewards/final_reward": 1.1304556359104303, "rewards/mask_iou_reward": 0.5652278179552152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.49527907371521, "rewards/thk_ans_format_reward": 1.0, "step": 2241, "think_completion_length": 49.46875 }, { "clip_ratio": 0.0, "completion_length": 163.375, "epoch": 3.7858347386172007, "grad_norm": 6.362501564088327, "kl": 0.4287109375, "learning_rate": 2.438448566610455e-07, "loss": 0.0004, "reward": 3.486288070678711, "reward_std": 0.2311484133824706, "rewards/final_reward": 1.6722780335729297, "rewards/mask_iou_reward": 0.8361390167864649, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4862881898880005, "rewards/thk_ans_format_reward": 1.0, "step": 2242, "think_completion_length": 42.625 }, { "clip_ratio": 0.0, "completion_length": 106.15625, "epoch": 3.78752107925801, "grad_norm": 23.489230128982506, "kl": 0.650390625, "learning_rate": 2.4350758853288365e-07, "loss": 0.0006, "reward": 3.7910631895065308, "reward_std": 0.031536445720121264, "rewards/final_reward": 1.8889532907636273, "rewards/mask_iou_reward": 0.9444766453818136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7910633087158203, "rewards/thk_ans_format_reward": 1.0, "step": 2243, "think_completion_length": 35.8125 }, { "clip_ratio": 0.0, "completion_length": 112.546875, "epoch": 3.7892074198988195, "grad_norm": 8.982080291054826, "kl": 0.548828125, "learning_rate": 2.4317032040472173e-07, "loss": 0.0006, "reward": 3.8048187494277954, "reward_std": 0.06128034554421902, "rewards/final_reward": 1.9306495480027581, "rewards/mask_iou_reward": 0.9653247740013791, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.804818868637085, "rewards/thk_ans_format_reward": 1.0, "step": 2244, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 135.25, "epoch": 3.790893760539629, "grad_norm": 9.209614953926765, "kl": 0.525390625, "learning_rate": 2.428330522765598e-07, "loss": 0.0005, "reward": 3.5684866905212402, "reward_std": 0.06634041853249073, "rewards/final_reward": 1.7982114449418654, "rewards/mask_iou_reward": 0.8991057224709327, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5684868097305298, "rewards/thk_ans_format_reward": 1.0, "step": 2245, "think_completion_length": 39.5625 }, { "clip_ratio": 0.0, "completion_length": 162.703125, "epoch": 3.7925801011804383, "grad_norm": 7.330973313643778, "kl": 0.587890625, "learning_rate": 2.4249578414839796e-07, "loss": 0.0006, "reward": 3.0241518020629883, "reward_std": 0.0621052160859108, "rewards/final_reward": 0.7467765296325852, "rewards/mask_iou_reward": 0.3733882648162926, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.024151861667633, "rewards/thk_ans_format_reward": 1.0, "step": 2246, "think_completion_length": 44.25 }, { "clip_ratio": 0.0, "completion_length": 109.890625, "epoch": 3.794266441821248, "grad_norm": 6.687208782228291, "kl": 0.564453125, "learning_rate": 2.4215851602023605e-07, "loss": 0.0005, "reward": 3.8984771966934204, "reward_std": 0.0070637313183397055, "rewards/final_reward": 1.9246973970811951, "rewards/mask_iou_reward": 0.9623486985405976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8984771966934204, "rewards/thk_ans_format_reward": 1.0, "step": 2247, "think_completion_length": 38.78125 }, { "clip_ratio": 0.0, "completion_length": 135.359375, "epoch": 3.7959527824620576, "grad_norm": 9.61095483768434, "kl": 0.603515625, "learning_rate": 2.418212478920742e-07, "loss": 0.0006, "reward": 3.569265365600586, "reward_std": 0.0774321025237441, "rewards/final_reward": 1.8779691392941031, "rewards/mask_iou_reward": 0.9389845696470516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5692653059959412, "rewards/thk_ans_format_reward": 1.0, "step": 2248, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 114.390625, "epoch": 3.7976391231028668, "grad_norm": 9.22119229655192, "kl": 0.53515625, "learning_rate": 2.414839797639123e-07, "loss": 0.0005, "reward": 3.323352098464966, "reward_std": 0.22597427666187286, "rewards/final_reward": 1.6997667859197665, "rewards/mask_iou_reward": 0.8498833929598832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3233520984649658, "rewards/thk_ans_format_reward": 1.0, "step": 2249, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 118.4375, "epoch": 3.799325463743676, "grad_norm": 14.710195243504126, "kl": 0.52734375, "learning_rate": 2.411467116357504e-07, "loss": 0.0005, "reward": 3.4668972492218018, "reward_std": 0.14925647154450417, "rewards/final_reward": 1.6721082015758792, "rewards/mask_iou_reward": 0.8360541007879396, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4668973684310913, "rewards/thk_ans_format_reward": 1.0, "step": 2250, "think_completion_length": 41.90625 }, { "clip_ratio": 0.0, "completion_length": 98.21875, "epoch": 3.8010118043844856, "grad_norm": 8.207460656492335, "kl": 0.552734375, "learning_rate": 2.408094435075885e-07, "loss": 0.0006, "reward": 3.2669492959976196, "reward_std": 0.20546810171799734, "rewards/final_reward": 1.0126203292625804, "rewards/mask_iou_reward": 0.5063101646312902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2669492959976196, "rewards/thk_ans_format_reward": 1.0, "step": 2251, "think_completion_length": 38.375 }, { "clip_ratio": 0.0, "completion_length": 139.8125, "epoch": 3.8026981450252952, "grad_norm": 12.870806106607096, "kl": 3.0625, "learning_rate": 2.4047217537942665e-07, "loss": 0.003, "reward": 3.3490335941314697, "reward_std": 0.19104180857539177, "rewards/final_reward": 1.4805655172421521, "rewards/mask_iou_reward": 0.7402827586210761, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.3646584749221802, "rewards/thk_ans_format_reward": 1.0, "step": 2252, "think_completion_length": 40.53125 }, { "clip_ratio": 0.0, "completion_length": 113.0625, "epoch": 3.8043844856661044, "grad_norm": 7.855936861729753, "kl": 0.6640625, "learning_rate": 2.4013490725126473e-07, "loss": 0.0007, "reward": 3.355563998222351, "reward_std": 0.1781761646270752, "rewards/final_reward": 1.064294652552745, "rewards/mask_iou_reward": 0.5321473262763725, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3555639386177063, "rewards/thk_ans_format_reward": 1.0, "step": 2253, "think_completion_length": 45.09375 }, { "clip_ratio": 0.0, "completion_length": 111.078125, "epoch": 3.806070826306914, "grad_norm": 7.972201657350866, "kl": 0.56640625, "learning_rate": 2.3979763912310287e-07, "loss": 0.0006, "reward": 3.093757748603821, "reward_std": 0.11396299209445715, "rewards/final_reward": 0.8799707201905502, "rewards/mask_iou_reward": 0.4399853600952751, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0937578082084656, "rewards/thk_ans_format_reward": 1.0, "step": 2254, "think_completion_length": 41.875 }, { "clip_ratio": 0.0, "completion_length": 109.46875, "epoch": 3.8077571669477237, "grad_norm": 14.963502770880439, "kl": 0.640625, "learning_rate": 2.3946037099494096e-07, "loss": 0.0006, "reward": 3.5387972593307495, "reward_std": 0.12345702201128006, "rewards/final_reward": 1.6928634493897015, "rewards/mask_iou_reward": 0.8464317246948507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.538797378540039, "rewards/thk_ans_format_reward": 1.0, "step": 2255, "think_completion_length": 37.84375 }, { "clip_ratio": 0.0, "completion_length": 110.71875, "epoch": 3.809443507588533, "grad_norm": 6.000778542491872, "kl": 0.62109375, "learning_rate": 2.391231028667791e-07, "loss": 0.0006, "reward": 3.505826711654663, "reward_std": 0.14541307091712952, "rewards/final_reward": 1.323766110955759, "rewards/mask_iou_reward": 0.6618830554778795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5058266520500183, "rewards/thk_ans_format_reward": 1.0, "step": 2256, "think_completion_length": 37.875 }, { "clip_ratio": 0.0, "completion_length": 129.0, "epoch": 3.811129848229342, "grad_norm": 7.77969270674984, "kl": 0.47265625, "learning_rate": 2.387858347386172e-07, "loss": 0.0005, "reward": 3.2499868869781494, "reward_std": 0.046357049606740475, "rewards/final_reward": 1.153350762684806, "rewards/mask_iou_reward": 0.576675381342403, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.249986857175827, "rewards/thk_ans_format_reward": 1.0, "step": 2257, "think_completion_length": 46.46875 }, { "clip_ratio": 0.0, "completion_length": 116.171875, "epoch": 3.8128161888701517, "grad_norm": 5.7795329042659525, "kl": 0.537109375, "learning_rate": 2.384485666104553e-07, "loss": 0.0005, "reward": 3.6921584606170654, "reward_std": 0.0910279038362205, "rewards/final_reward": 1.607268500317296, "rewards/mask_iou_reward": 0.803634250158648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6921584606170654, "rewards/thk_ans_format_reward": 1.0, "step": 2258, "think_completion_length": 46.0 }, { "clip_ratio": 0.0, "completion_length": 129.0, "epoch": 3.8145025295109614, "grad_norm": 44.85207032598342, "kl": 0.4765625, "learning_rate": 2.3811129848229342e-07, "loss": 0.0005, "reward": 3.449580430984497, "reward_std": 0.15453584492206573, "rewards/final_reward": 1.455731098691922, "rewards/mask_iou_reward": 0.727865549345961, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4495803713798523, "rewards/thk_ans_format_reward": 1.0, "step": 2259, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 114.578125, "epoch": 3.8161888701517706, "grad_norm": 8.150016547919764, "kl": 0.5, "learning_rate": 2.377740303541315e-07, "loss": 0.0005, "reward": 3.3544111251831055, "reward_std": 0.04027549549937248, "rewards/final_reward": 1.8442765117650843, "rewards/mask_iou_reward": 0.9221382558825422, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3544110655784607, "rewards/thk_ans_format_reward": 1.0, "step": 2260, "think_completion_length": 40.28125 }, { "clip_ratio": 0.0, "completion_length": 111.1875, "epoch": 3.81787521079258, "grad_norm": 22.799975985239875, "kl": 0.5546875, "learning_rate": 2.3743676222596964e-07, "loss": 0.0006, "reward": 3.2492820024490356, "reward_std": 0.0645350944250822, "rewards/final_reward": 0.7860365323414302, "rewards/mask_iou_reward": 0.3930182661707151, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2492821216583252, "rewards/thk_ans_format_reward": 1.0, "step": 2261, "think_completion_length": 40.03125 }, { "clip_ratio": 0.0, "completion_length": 128.125, "epoch": 3.8195615514333894, "grad_norm": 9.297461895727002, "kl": 0.6875, "learning_rate": 2.3709949409780776e-07, "loss": 0.0007, "reward": 3.1288259029388428, "reward_std": 0.10558873787522316, "rewards/final_reward": 0.5146808146562, "rewards/mask_iou_reward": 0.2573404073281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1288259029388428, "rewards/thk_ans_format_reward": 1.0, "step": 2262, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 111.140625, "epoch": 3.821247892074199, "grad_norm": 6.6976524641599875, "kl": 0.564453125, "learning_rate": 2.3676222596964585e-07, "loss": 0.0005, "reward": 3.7508952617645264, "reward_std": 0.07979346811771393, "rewards/final_reward": 1.9381548986924892, "rewards/mask_iou_reward": 0.9690774493462446, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7508953213691711, "rewards/thk_ans_format_reward": 1.0, "step": 2263, "think_completion_length": 37.75 }, { "clip_ratio": 0.0, "completion_length": 115.53125, "epoch": 3.822934232715008, "grad_norm": 6.993854221440401, "kl": 0.55078125, "learning_rate": 2.3642495784148399e-07, "loss": 0.0005, "reward": 3.3509509563446045, "reward_std": 0.08846403658390045, "rewards/final_reward": 0.9924140661206481, "rewards/mask_iou_reward": 0.49620703306032404, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3509510159492493, "rewards/thk_ans_format_reward": 1.0, "step": 2264, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 110.0625, "epoch": 3.824620573355818, "grad_norm": 7.1760436728186106, "kl": 0.61328125, "learning_rate": 2.3608768971332207e-07, "loss": 0.0006, "reward": 3.4479448795318604, "reward_std": 0.022803470492362976, "rewards/final_reward": 1.0529762239628047, "rewards/mask_iou_reward": 0.5264881119814023, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4479448199272156, "rewards/thk_ans_format_reward": 1.0, "step": 2265, "think_completion_length": 44.46875 }, { "clip_ratio": 0.0, "completion_length": 109.96875, "epoch": 3.8263069139966275, "grad_norm": 15.65109177458448, "kl": 0.572265625, "learning_rate": 2.357504215851602e-07, "loss": 0.0006, "reward": 3.2824543714523315, "reward_std": 0.06921002082526684, "rewards/final_reward": 1.439779531366369, "rewards/mask_iou_reward": 0.7198897656831845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2824545502662659, "rewards/thk_ans_format_reward": 1.0, "step": 2266, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 110.453125, "epoch": 3.8279932546374367, "grad_norm": 16.79945725064245, "kl": 0.568359375, "learning_rate": 2.354131534569983e-07, "loss": 0.0006, "reward": 3.217270016670227, "reward_std": 0.03682664316147566, "rewards/final_reward": 1.5846480653921238, "rewards/mask_iou_reward": 0.7923240326960619, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.217270016670227, "rewards/thk_ans_format_reward": 1.0, "step": 2267, "think_completion_length": 38.65625 }, { "clip_ratio": 0.0, "completion_length": 125.5625, "epoch": 3.8296795952782463, "grad_norm": 8.20014333565937, "kl": 0.587890625, "learning_rate": 2.3507588532883641e-07, "loss": 0.0006, "reward": 3.5661516189575195, "reward_std": 0.022400468587875366, "rewards/final_reward": 1.853252251249863, "rewards/mask_iou_reward": 0.9266261256249315, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.56615149974823, "rewards/thk_ans_format_reward": 1.0, "step": 2268, "think_completion_length": 37.03125 }, { "clip_ratio": 0.0, "completion_length": 112.015625, "epoch": 3.8313659359190555, "grad_norm": 30.934744592352697, "kl": 0.5859375, "learning_rate": 2.3473861720067453e-07, "loss": 0.0006, "reward": 3.341153144836426, "reward_std": 0.27450861036777496, "rewards/final_reward": 1.215352754567551, "rewards/mask_iou_reward": 0.6076763772837755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3411532640457153, "rewards/thk_ans_format_reward": 1.0, "step": 2269, "think_completion_length": 40.03125 }, { "clip_ratio": 0.0, "completion_length": 166.171875, "epoch": 3.833052276559865, "grad_norm": 12.834656985151206, "kl": 0.48828125, "learning_rate": 2.3440134907251264e-07, "loss": 0.0005, "reward": 3.2810736894607544, "reward_std": 0.18031561793759465, "rewards/final_reward": 0.9614180023944365, "rewards/mask_iou_reward": 0.48070900119721827, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2810736298561096, "rewards/thk_ans_format_reward": 1.0, "step": 2270, "think_completion_length": 42.0625 }, { "clip_ratio": 0.0, "completion_length": 111.140625, "epoch": 3.8347386172006743, "grad_norm": 14.12803405633131, "kl": 0.55078125, "learning_rate": 2.3406408094435076e-07, "loss": 0.0006, "reward": 3.6309303045272827, "reward_std": 0.07168065011501312, "rewards/final_reward": 1.728945760123278, "rewards/mask_iou_reward": 0.864472880061639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6309301853179932, "rewards/thk_ans_format_reward": 1.0, "step": 2271, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 127.921875, "epoch": 3.836424957841484, "grad_norm": 5.372312685751642, "kl": 0.5244140625, "learning_rate": 2.3372681281618887e-07, "loss": 0.0005, "reward": 3.1526389122009277, "reward_std": 0.2288635354489088, "rewards/final_reward": 1.4706292807385708, "rewards/mask_iou_reward": 0.7353146403692854, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.1995137929916382, "rewards/thk_ans_format_reward": 1.0, "step": 2272, "think_completion_length": 38.21875 }, { "clip_ratio": 0.0, "completion_length": 112.765625, "epoch": 3.8381112984822936, "grad_norm": 97.88424337284546, "kl": 0.525390625, "learning_rate": 2.3338954468802696e-07, "loss": 0.0005, "reward": 3.5338401794433594, "reward_std": 0.19119788333773613, "rewards/final_reward": 1.326255627713599, "rewards/mask_iou_reward": 0.6631278138567995, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5338401794433594, "rewards/thk_ans_format_reward": 1.0, "step": 2273, "think_completion_length": 43.03125 }, { "clip_ratio": 0.0, "completion_length": 118.765625, "epoch": 3.839797639123103, "grad_norm": 12.919983043098979, "kl": 0.55078125, "learning_rate": 2.330522765598651e-07, "loss": 0.0005, "reward": 3.437851667404175, "reward_std": 0.02033051522448659, "rewards/final_reward": 1.091737954300925, "rewards/mask_iou_reward": 0.5458689771504625, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4378515779972076, "rewards/thk_ans_format_reward": 1.0, "step": 2274, "think_completion_length": 42.96875 }, { "clip_ratio": 0.0, "completion_length": 139.890625, "epoch": 3.8414839797639124, "grad_norm": 112.94144962199447, "kl": 0.521484375, "learning_rate": 2.3271500843170318e-07, "loss": 0.0005, "reward": 3.4854648113250732, "reward_std": 0.0527753047645092, "rewards/final_reward": 1.4952481048274144, "rewards/mask_iou_reward": 0.7476240524137072, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4854648113250732, "rewards/thk_ans_format_reward": 1.0, "step": 2275, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 112.140625, "epoch": 3.8431703204047216, "grad_norm": 14.33435507420024, "kl": 0.568359375, "learning_rate": 2.323777403035413e-07, "loss": 0.0006, "reward": 3.5199133157730103, "reward_std": 0.04245698405429721, "rewards/final_reward": 1.245931844644452, "rewards/mask_iou_reward": 0.622965922322226, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5199132561683655, "rewards/thk_ans_format_reward": 1.0, "step": 2276, "think_completion_length": 44.75 }, { "clip_ratio": 0.0, "completion_length": 172.203125, "epoch": 3.8448566610455313, "grad_norm": 5.219698782691755, "kl": 0.4921875, "learning_rate": 2.320404721753794e-07, "loss": 0.0005, "reward": 3.1729589700698853, "reward_std": 0.08763368986546993, "rewards/final_reward": 1.6196611753146797, "rewards/mask_iou_reward": 0.8098305876573398, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1729588210582733, "rewards/thk_ans_format_reward": 1.0, "step": 2277, "think_completion_length": 47.9375 }, { "clip_ratio": 0.0, "completion_length": 133.734375, "epoch": 3.8465430016863404, "grad_norm": 8.98229960876335, "kl": 0.5234375, "learning_rate": 2.3170320404721753e-07, "loss": 0.0005, "reward": 3.2349685430526733, "reward_std": 0.035814208909869194, "rewards/final_reward": 0.9417275861362087, "rewards/mask_iou_reward": 0.47086379306810433, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2349684834480286, "rewards/thk_ans_format_reward": 1.0, "step": 2278, "think_completion_length": 47.4375 }, { "clip_ratio": 0.0, "completion_length": 113.65625, "epoch": 3.84822934232715, "grad_norm": 13.066641019019265, "kl": 0.5, "learning_rate": 2.3136593591905564e-07, "loss": 0.0005, "reward": 3.577161431312561, "reward_std": 0.1769073959439993, "rewards/final_reward": 1.7993820008262598, "rewards/mask_iou_reward": 0.8996910004131299, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5771613717079163, "rewards/thk_ans_format_reward": 1.0, "step": 2279, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 117.46875, "epoch": 3.8499156829679597, "grad_norm": 16.36272162499701, "kl": 0.54296875, "learning_rate": 2.3102866779089375e-07, "loss": 0.0005, "reward": 3.342397093772888, "reward_std": 0.05663332901895046, "rewards/final_reward": 1.3201730435836965, "rewards/mask_iou_reward": 0.6600865217918482, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.342397153377533, "rewards/thk_ans_format_reward": 1.0, "step": 2280, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 115.953125, "epoch": 3.851602023608769, "grad_norm": 21.31976206344704, "kl": 0.564453125, "learning_rate": 2.3069139966273184e-07, "loss": 0.0006, "reward": 3.7095367908477783, "reward_std": 0.09649944491684437, "rewards/final_reward": 1.7138956461854962, "rewards/mask_iou_reward": 0.8569478230927481, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7095369100570679, "rewards/thk_ans_format_reward": 1.0, "step": 2281, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 190.25, "epoch": 3.8532883642495785, "grad_norm": 26.774222606813684, "kl": 0.5556640625, "learning_rate": 2.3035413153456998e-07, "loss": 0.0006, "reward": 3.27087664604187, "reward_std": 0.27236051857471466, "rewards/final_reward": 1.299968729081332, "rewards/mask_iou_reward": 0.649984364540666, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2708768248558044, "rewards/thk_ans_format_reward": 1.0, "step": 2282, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 115.6875, "epoch": 3.8549747048903877, "grad_norm": 11.647422897155579, "kl": 0.53515625, "learning_rate": 2.3001686340640807e-07, "loss": 0.0006, "reward": 3.6066473722457886, "reward_std": 0.037033793749287724, "rewards/final_reward": 1.7790633561622415, "rewards/mask_iou_reward": 0.8895316780811208, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6066473126411438, "rewards/thk_ans_format_reward": 1.0, "step": 2283, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 120.28125, "epoch": 3.8566610455311974, "grad_norm": 6.399696029593912, "kl": 0.59765625, "learning_rate": 2.296795952782462e-07, "loss": 0.0006, "reward": 3.1298261880874634, "reward_std": 0.135479424148798, "rewards/final_reward": 1.0958613893826303, "rewards/mask_iou_reward": 0.5479306946913152, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1298261880874634, "rewards/thk_ans_format_reward": 1.0, "step": 2284, "think_completion_length": 39.25 }, { "clip_ratio": 0.0, "completion_length": 231.890625, "epoch": 3.8583473861720066, "grad_norm": 12.53030051498087, "kl": 0.470703125, "learning_rate": 2.293423271500843e-07, "loss": 0.0005, "reward": 3.647843360900879, "reward_std": 0.18895704671740532, "rewards/final_reward": 1.4987438065983483, "rewards/mask_iou_reward": 0.7493719032991741, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6478431224822998, "rewards/thk_ans_format_reward": 1.0, "step": 2285, "think_completion_length": 43.875 }, { "clip_ratio": 0.0, "completion_length": 110.859375, "epoch": 3.860033726812816, "grad_norm": 8.084188059338498, "kl": 0.56640625, "learning_rate": 2.290050590219224e-07, "loss": 0.0006, "reward": 3.7369555234909058, "reward_std": 0.12819246295839548, "rewards/final_reward": 1.6847100480336998, "rewards/mask_iou_reward": 0.8423550240168499, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7369555830955505, "rewards/thk_ans_format_reward": 1.0, "step": 2286, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 213.9375, "epoch": 3.861720067453626, "grad_norm": 13.772257072069493, "kl": 0.4736328125, "learning_rate": 2.2866779089376052e-07, "loss": 0.0005, "reward": 3.70966899394989, "reward_std": 0.07207040954381227, "rewards/final_reward": 1.501267929796344, "rewards/mask_iou_reward": 0.750633964898172, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.709669053554535, "rewards/thk_ans_format_reward": 1.0, "step": 2287, "think_completion_length": 44.40625 }, { "clip_ratio": 0.0, "completion_length": 129.515625, "epoch": 3.863406408094435, "grad_norm": 16.33337265919695, "kl": 0.537109375, "learning_rate": 2.2833052276559864e-07, "loss": 0.0005, "reward": 3.5507311820983887, "reward_std": 0.11423347145318985, "rewards/final_reward": 1.7487370427616498, "rewards/mask_iou_reward": 0.8743685213808249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5507311820983887, "rewards/thk_ans_format_reward": 1.0, "step": 2288, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 115.328125, "epoch": 3.8650927487352447, "grad_norm": 7.145956043564162, "kl": 0.541015625, "learning_rate": 2.2799325463743673e-07, "loss": 0.0005, "reward": 2.844885468482971, "reward_std": 0.35542061924934387, "rewards/final_reward": 0.911483163947697, "rewards/mask_iou_reward": 0.4557415819738485, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8448854386806488, "rewards/thk_ans_format_reward": 1.0, "step": 2289, "think_completion_length": 45.5 }, { "clip_ratio": 0.0, "completion_length": 231.953125, "epoch": 3.866779089376054, "grad_norm": 53.96931329998067, "kl": 0.546875, "learning_rate": 2.2765598650927487e-07, "loss": 0.0005, "reward": 3.043015956878662, "reward_std": 0.4434027671813965, "rewards/final_reward": 0.5968193481833238, "rewards/mask_iou_reward": 0.2984096740916619, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.136765956878662, "rewards/thk_ans_format_reward": 0.953125, "step": 2290, "think_completion_length": 43.9375 }, { "clip_ratio": 0.0, "completion_length": 110.671875, "epoch": 3.8684654300168635, "grad_norm": 10.023266830041628, "kl": 0.63671875, "learning_rate": 2.2731871838111298e-07, "loss": 0.0006, "reward": 2.770586848258972, "reward_std": 0.19898640364408493, "rewards/final_reward": 1.2640052339226235, "rewards/mask_iou_reward": 0.6320026169613118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7705867886543274, "rewards/thk_ans_format_reward": 1.0, "step": 2291, "think_completion_length": 35.5625 }, { "clip_ratio": 0.0, "completion_length": 112.84375, "epoch": 3.8701517706576727, "grad_norm": 15.042473029968052, "kl": 0.57421875, "learning_rate": 2.269814502529511e-07, "loss": 0.0006, "reward": 3.5436513423919678, "reward_std": 0.007338247261941433, "rewards/final_reward": 1.1669293771860252, "rewards/mask_iou_reward": 0.5834646885930126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5436512231826782, "rewards/thk_ans_format_reward": 1.0, "step": 2292, "think_completion_length": 42.78125 }, { "clip_ratio": 0.0, "completion_length": 163.234375, "epoch": 3.8718381112984823, "grad_norm": 4.985518385493025, "kl": 0.4462890625, "learning_rate": 2.266441821247892e-07, "loss": 0.0004, "reward": 3.6346672773361206, "reward_std": 0.05469698668457568, "rewards/final_reward": 1.8549077323712804, "rewards/mask_iou_reward": 0.9274538661856402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.634667456150055, "rewards/thk_ans_format_reward": 1.0, "step": 2293, "think_completion_length": 39.4375 }, { "clip_ratio": 0.0, "completion_length": 112.34375, "epoch": 3.873524451939292, "grad_norm": 8.984147115120505, "kl": 0.533203125, "learning_rate": 2.263069139966273e-07, "loss": 0.0005, "reward": 3.6444458961486816, "reward_std": 0.11999626411125064, "rewards/final_reward": 1.694625798370633, "rewards/mask_iou_reward": 0.8473128991853165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6444460153579712, "rewards/thk_ans_format_reward": 1.0, "step": 2294, "think_completion_length": 40.75 }, { "clip_ratio": 0.0, "completion_length": 112.875, "epoch": 3.875210792580101, "grad_norm": 10.732266711491457, "kl": 0.53125, "learning_rate": 2.2596964586846544e-07, "loss": 0.0005, "reward": 3.298237681388855, "reward_std": 0.07919766753911972, "rewards/final_reward": 1.287060814014629, "rewards/mask_iou_reward": 0.6435304070073145, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2982377409934998, "rewards/thk_ans_format_reward": 1.0, "step": 2295, "think_completion_length": 41.78125 }, { "clip_ratio": 0.0, "completion_length": 130.421875, "epoch": 3.876897133220911, "grad_norm": 17.229923944564966, "kl": 0.552734375, "learning_rate": 2.2563237774030352e-07, "loss": 0.0006, "reward": 3.177481770515442, "reward_std": 0.33257442712783813, "rewards/final_reward": 1.1130151611958692, "rewards/mask_iou_reward": 0.5565075805979346, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1774817407131195, "rewards/thk_ans_format_reward": 1.0, "step": 2296, "think_completion_length": 37.4375 }, { "clip_ratio": 0.0, "completion_length": 133.515625, "epoch": 3.87858347386172, "grad_norm": 7.660805708045417, "kl": 0.515625, "learning_rate": 2.2529510961214166e-07, "loss": 0.0005, "reward": 3.45150625705719, "reward_std": 0.13643109984695911, "rewards/final_reward": 1.5186618102721703, "rewards/mask_iou_reward": 0.7593309051360851, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4515064358711243, "rewards/thk_ans_format_reward": 1.0, "step": 2297, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 117.0625, "epoch": 3.8802698145025296, "grad_norm": 9.538622827190537, "kl": 0.53515625, "learning_rate": 2.2495784148397975e-07, "loss": 0.0005, "reward": 3.696483016014099, "reward_std": 0.07705111056566238, "rewards/final_reward": 1.8266000804889195, "rewards/mask_iou_reward": 0.9133000402444598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6964829564094543, "rewards/thk_ans_format_reward": 1.0, "step": 2298, "think_completion_length": 41.40625 }, { "clip_ratio": 0.0, "completion_length": 133.203125, "epoch": 3.881956155143339, "grad_norm": 61.88683587213132, "kl": 0.4921875, "learning_rate": 2.2462057335581786e-07, "loss": 0.0005, "reward": 3.261089324951172, "reward_std": 0.3139321506023407, "rewards/final_reward": 1.5563992586356612, "rewards/mask_iou_reward": 0.7781996293178306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2610894441604614, "rewards/thk_ans_format_reward": 1.0, "step": 2299, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 109.25, "epoch": 3.8836424957841484, "grad_norm": 14.236252838419876, "kl": 0.5625, "learning_rate": 2.2428330522765598e-07, "loss": 0.0006, "reward": 3.439433217048645, "reward_std": 0.03572419285774231, "rewards/final_reward": 1.9520528977990645, "rewards/mask_iou_reward": 0.9760264488995323, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4394332766532898, "rewards/thk_ans_format_reward": 1.0, "step": 2300, "think_completion_length": 38.875 }, { "clip_ratio": 0.0, "completion_length": 172.296875, "epoch": 3.885328836424958, "grad_norm": 8.491205443011415, "kl": 0.521484375, "learning_rate": 2.239460370994941e-07, "loss": 0.0005, "reward": 3.5966683626174927, "reward_std": 0.05547321029007435, "rewards/final_reward": 1.6187499078921728, "rewards/mask_iou_reward": 0.8093749539460864, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.596668303012848, "rewards/thk_ans_format_reward": 1.0, "step": 2301, "think_completion_length": 40.1875 }, { "clip_ratio": 0.0, "completion_length": 117.625, "epoch": 3.8870151770657673, "grad_norm": 9.93154426736727, "kl": 0.529296875, "learning_rate": 2.236087689713322e-07, "loss": 0.0005, "reward": 3.212773323059082, "reward_std": 0.04713407810777426, "rewards/final_reward": 1.1338621025049003, "rewards/mask_iou_reward": 0.5669310512524501, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2127732634544373, "rewards/thk_ans_format_reward": 1.0, "step": 2302, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 110.125, "epoch": 3.8887015177065765, "grad_norm": 7.367578602723742, "kl": 0.560546875, "learning_rate": 2.2327150084317032e-07, "loss": 0.0006, "reward": 3.3229422569274902, "reward_std": 0.18621986359357834, "rewards/final_reward": 1.3275170722478604, "rewards/mask_iou_reward": 0.6637585361239302, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.322942316532135, "rewards/thk_ans_format_reward": 1.0, "step": 2303, "think_completion_length": 42.09375 }, { "clip_ratio": 0.0, "completion_length": 140.3125, "epoch": 3.890387858347386, "grad_norm": 6.136457414582041, "kl": 0.5322265625, "learning_rate": 2.229342327150084e-07, "loss": 0.0005, "reward": 3.64126718044281, "reward_std": 0.048449140042066574, "rewards/final_reward": 1.8254532397981125, "rewards/mask_iou_reward": 0.9127266198990562, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6412672400474548, "rewards/thk_ans_format_reward": 1.0, "step": 2304, "think_completion_length": 39.09375 }, { "clip_ratio": 0.0, "completion_length": 123.5625, "epoch": 3.8920741989881957, "grad_norm": 33.440203072134736, "kl": 0.556640625, "learning_rate": 2.2259696458684655e-07, "loss": 0.0006, "reward": 3.7791318893432617, "reward_std": 0.054630931466817856, "rewards/final_reward": 1.8482051781942652, "rewards/mask_iou_reward": 0.9241025890971326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7791318893432617, "rewards/thk_ans_format_reward": 1.0, "step": 2305, "think_completion_length": 43.75 }, { "clip_ratio": 0.0, "completion_length": 128.890625, "epoch": 3.893760539629005, "grad_norm": 43.10138167582071, "kl": 0.5234375, "learning_rate": 2.2225969645868464e-07, "loss": 0.0005, "reward": 3.2736620903015137, "reward_std": 0.4450536370277405, "rewards/final_reward": 1.3462738375611605, "rewards/mask_iou_reward": 0.6731369187805802, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.2892871499061584, "rewards/thk_ans_format_reward": 1.0, "step": 2306, "think_completion_length": 48.125 }, { "clip_ratio": 0.0, "completion_length": 115.4375, "epoch": 3.8954468802698146, "grad_norm": 8.890895899704827, "kl": 0.529296875, "learning_rate": 2.2192242833052275e-07, "loss": 0.0005, "reward": 3.341302990913391, "reward_std": 0.10977509245276451, "rewards/final_reward": 1.0956768907996606, "rewards/mask_iou_reward": 0.5478384453998303, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3413029909133911, "rewards/thk_ans_format_reward": 1.0, "step": 2307, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 138.546875, "epoch": 3.897133220910624, "grad_norm": 4.020982898499106, "kl": 0.4716796875, "learning_rate": 2.2158516020236086e-07, "loss": 0.0004, "reward": 3.4928349256515503, "reward_std": 0.053374568466097116, "rewards/final_reward": 1.880111202432228, "rewards/mask_iou_reward": 0.940055601216114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4928348660469055, "rewards/thk_ans_format_reward": 1.0, "step": 2308, "think_completion_length": 39.96875 }, { "clip_ratio": 0.0, "completion_length": 112.734375, "epoch": 3.8988195615514334, "grad_norm": 8.190335018830517, "kl": 0.60546875, "learning_rate": 2.2124789207419898e-07, "loss": 0.0006, "reward": 3.276050329208374, "reward_std": 0.14282017201185226, "rewards/final_reward": 1.581921026116359, "rewards/mask_iou_reward": 0.7909605130581795, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2760505676269531, "rewards/thk_ans_format_reward": 1.0, "step": 2309, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 184.828125, "epoch": 3.9005059021922426, "grad_norm": 12.560647227806385, "kl": 0.48046875, "learning_rate": 2.209106239460371e-07, "loss": 0.0005, "reward": 3.7485271692276, "reward_std": 0.062254197895526886, "rewards/final_reward": 1.8556166528903737, "rewards/mask_iou_reward": 0.9278083264451868, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7485272288322449, "rewards/thk_ans_format_reward": 1.0, "step": 2310, "think_completion_length": 40.21875 }, { "clip_ratio": 0.0, "completion_length": 115.34375, "epoch": 3.902192242833052, "grad_norm": 11.193565133332825, "kl": 0.583984375, "learning_rate": 2.205733558178752e-07, "loss": 0.0006, "reward": 3.7668726444244385, "reward_std": 0.020487097091972828, "rewards/final_reward": 1.5766710877741281, "rewards/mask_iou_reward": 0.7883355438870641, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7668728232383728, "rewards/thk_ans_format_reward": 1.0, "step": 2311, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 163.609375, "epoch": 3.903878583473862, "grad_norm": 16.8454073316175, "kl": 0.5361328125, "learning_rate": 2.202360876897133e-07, "loss": 0.0005, "reward": 2.701531767845154, "reward_std": 0.2758069708943367, "rewards/final_reward": 0.5598322417967414, "rewards/mask_iou_reward": 0.2799161208983707, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 0.732781708240509, "rewards/thk_ans_format_reward": 0.984375, "step": 2312, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 113.734375, "epoch": 3.905564924114671, "grad_norm": 6.042807399278246, "kl": 0.533203125, "learning_rate": 2.1989881956155143e-07, "loss": 0.0005, "reward": 3.065964460372925, "reward_std": 0.05612972844392061, "rewards/final_reward": 1.332514875414373, "rewards/mask_iou_reward": 0.6662574377071865, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0659645199775696, "rewards/thk_ans_format_reward": 1.0, "step": 2313, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 115.09375, "epoch": 3.9072512647554807, "grad_norm": 20.914529546989716, "kl": 0.6171875, "learning_rate": 2.1956155143338952e-07, "loss": 0.0006, "reward": 3.776341199874878, "reward_std": 0.06756392121315002, "rewards/final_reward": 1.6560440449945635, "rewards/mask_iou_reward": 0.8280220224972817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7763413190841675, "rewards/thk_ans_format_reward": 1.0, "step": 2314, "think_completion_length": 51.1875 }, { "clip_ratio": 0.0, "completion_length": 111.953125, "epoch": 3.9089376053962903, "grad_norm": 7.652094886113456, "kl": 0.62109375, "learning_rate": 2.1922428330522766e-07, "loss": 0.0007, "reward": 3.4693918228149414, "reward_std": 0.07247776072472334, "rewards/final_reward": 1.3367317342178633, "rewards/mask_iou_reward": 0.6683658671089316, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4693917036056519, "rewards/thk_ans_format_reward": 1.0, "step": 2315, "think_completion_length": 40.625 }, { "clip_ratio": 0.0, "completion_length": 112.65625, "epoch": 3.9106239460370995, "grad_norm": 16.51306856454786, "kl": 0.4873046875, "learning_rate": 2.1888701517706575e-07, "loss": 0.0005, "reward": 3.5613391399383545, "reward_std": 0.20852696895599365, "rewards/final_reward": 1.490526290384657, "rewards/mask_iou_reward": 0.7452631451923285, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5613394379615784, "rewards/thk_ans_format_reward": 1.0, "step": 2316, "think_completion_length": 42.65625 }, { "clip_ratio": 0.0, "completion_length": 113.65625, "epoch": 3.9123102866779087, "grad_norm": 13.014352060898924, "kl": 0.58203125, "learning_rate": 2.1854974704890386e-07, "loss": 0.0006, "reward": 3.795401930809021, "reward_std": 0.07416247483342886, "rewards/final_reward": 1.679278486730068, "rewards/mask_iou_reward": 0.839639243365034, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7954018115997314, "rewards/thk_ans_format_reward": 1.0, "step": 2317, "think_completion_length": 46.6875 }, { "clip_ratio": 0.0, "completion_length": 166.453125, "epoch": 3.9139966273187183, "grad_norm": 8.64054859551105, "kl": 0.4921875, "learning_rate": 2.1821247892074197e-07, "loss": 0.0005, "reward": 3.2304306030273438, "reward_std": 0.41698751598596573, "rewards/final_reward": 1.578626626927155, "rewards/mask_iou_reward": 0.7893133134635775, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.324180543422699, "rewards/thk_ans_format_reward": 0.953125, "step": 2318, "think_completion_length": 40.53125 }, { "clip_ratio": 0.0, "completion_length": 110.96875, "epoch": 3.915682967959528, "grad_norm": 15.939587317660832, "kl": 0.611328125, "learning_rate": 2.178752107925801e-07, "loss": 0.0006, "reward": 3.709532141685486, "reward_std": 0.0666387677192688, "rewards/final_reward": 1.6968465215991122, "rewards/mask_iou_reward": 0.8484232607995561, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7095322012901306, "rewards/thk_ans_format_reward": 1.0, "step": 2319, "think_completion_length": 46.28125 }, { "clip_ratio": 0.0, "completion_length": 129.71875, "epoch": 3.917369308600337, "grad_norm": 5.404933794280482, "kl": 0.650390625, "learning_rate": 2.1753794266441818e-07, "loss": 0.0007, "reward": 3.586440086364746, "reward_std": 0.16442099958658218, "rewards/final_reward": 1.5678774901850385, "rewards/mask_iou_reward": 0.7839387450925193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5864400267601013, "rewards/thk_ans_format_reward": 1.0, "step": 2320, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 130.25, "epoch": 3.919055649241147, "grad_norm": 13.873493009868913, "kl": 0.509765625, "learning_rate": 2.1720067453625632e-07, "loss": 0.0005, "reward": 3.51028573513031, "reward_std": 0.04750672448426485, "rewards/final_reward": 1.8690834462785189, "rewards/mask_iou_reward": 0.9345417231392594, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.510285496711731, "rewards/thk_ans_format_reward": 1.0, "step": 2321, "think_completion_length": 49.875 }, { "clip_ratio": 0.0, "completion_length": 237.0625, "epoch": 3.920741989881956, "grad_norm": 7.491766455383782, "kl": 0.48046875, "learning_rate": 2.1686340640809443e-07, "loss": 0.0005, "reward": 2.8261446952819824, "reward_std": 0.4372350126504898, "rewards/final_reward": 0.760745959984364, "rewards/mask_iou_reward": 0.380372979992182, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 0.9355196356773376, "rewards/thk_ans_format_reward": 0.953125, "step": 2322, "think_completion_length": 51.5625 }, { "clip_ratio": 0.0, "completion_length": 113.890625, "epoch": 3.9224283305227656, "grad_norm": 8.40222217416887, "kl": 0.53515625, "learning_rate": 2.1652613827993254e-07, "loss": 0.0005, "reward": 3.835439443588257, "reward_std": 0.06295907869935036, "rewards/final_reward": 1.7956223026772338, "rewards/mask_iou_reward": 0.8978111513386169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8354395031929016, "rewards/thk_ans_format_reward": 1.0, "step": 2323, "think_completion_length": 40.375 }, { "clip_ratio": 0.0, "completion_length": 158.890625, "epoch": 3.924114671163575, "grad_norm": 4.534313067244203, "kl": 0.548828125, "learning_rate": 2.1618887015177066e-07, "loss": 0.0005, "reward": 3.424034357070923, "reward_std": 0.10503194469492882, "rewards/final_reward": 0.9665427966019128, "rewards/mask_iou_reward": 0.4832713983009564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4240343272686005, "rewards/thk_ans_format_reward": 1.0, "step": 2324, "think_completion_length": 39.28125 }, { "clip_ratio": 0.0, "completion_length": 126.96875, "epoch": 3.9258010118043845, "grad_norm": 5.49761755891851, "kl": 0.5283203125, "learning_rate": 2.1585160202360875e-07, "loss": 0.0005, "reward": 3.173019528388977, "reward_std": 0.19381612539291382, "rewards/final_reward": 1.1044718489811443, "rewards/mask_iou_reward": 0.5522359244905721, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1730196475982666, "rewards/thk_ans_format_reward": 1.0, "step": 2325, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 132.90625, "epoch": 3.927487352445194, "grad_norm": 10.577534335724108, "kl": 0.5390625, "learning_rate": 2.1551433389544689e-07, "loss": 0.0005, "reward": 3.248093843460083, "reward_std": 0.06915931031107903, "rewards/final_reward": 1.10379462136804, "rewards/mask_iou_reward": 0.55189731068402, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2480938136577606, "rewards/thk_ans_format_reward": 1.0, "step": 2326, "think_completion_length": 40.4375 }, { "clip_ratio": 0.0, "completion_length": 111.84375, "epoch": 3.9291736930860033, "grad_norm": 17.10393035194125, "kl": 0.5703125, "learning_rate": 2.1517706576728497e-07, "loss": 0.0006, "reward": 3.4472368955612183, "reward_std": 0.23830869793891907, "rewards/final_reward": 1.4801394408547965, "rewards/mask_iou_reward": 0.7400697204273983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.447236955165863, "rewards/thk_ans_format_reward": 1.0, "step": 2327, "think_completion_length": 42.65625 }, { "clip_ratio": 0.0, "completion_length": 147.0, "epoch": 3.930860033726813, "grad_norm": 26.239557118806275, "kl": 0.517578125, "learning_rate": 2.1483979763912311e-07, "loss": 0.0005, "reward": 3.143664836883545, "reward_std": 0.11279793456196785, "rewards/final_reward": 1.392644433981519, "rewards/mask_iou_reward": 0.6963222169907595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.143664836883545, "rewards/thk_ans_format_reward": 1.0, "step": 2328, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 157.71875, "epoch": 3.932546374367622, "grad_norm": 7.215896642060539, "kl": 0.501953125, "learning_rate": 2.145025295109612e-07, "loss": 0.0005, "reward": 3.80619215965271, "reward_std": 0.05122208781540394, "rewards/final_reward": 1.8444367542741427, "rewards/mask_iou_reward": 0.9222183771370713, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8061923384666443, "rewards/thk_ans_format_reward": 1.0, "step": 2329, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 127.15625, "epoch": 3.9342327150084317, "grad_norm": 139.07716888413185, "kl": 0.5107421875, "learning_rate": 2.1416526138279931e-07, "loss": 0.0005, "reward": 3.829445242881775, "reward_std": 0.03507534274831414, "rewards/final_reward": 1.7253977980439543, "rewards/mask_iou_reward": 0.8626988990219772, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8294451236724854, "rewards/thk_ans_format_reward": 1.0, "step": 2330, "think_completion_length": 45.03125 }, { "clip_ratio": 0.0, "completion_length": 161.890625, "epoch": 3.935919055649241, "grad_norm": 10.157885839120912, "kl": 0.568359375, "learning_rate": 2.1382799325463743e-07, "loss": 0.0006, "reward": 3.1242516040802, "reward_std": 0.2557784169912338, "rewards/final_reward": 0.6962164457702434, "rewards/mask_iou_reward": 0.3481082228851217, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1242516934871674, "rewards/thk_ans_format_reward": 1.0, "step": 2331, "think_completion_length": 49.34375 }, { "clip_ratio": 0.0, "completion_length": 113.3125, "epoch": 3.9376053962900506, "grad_norm": 15.872231262992985, "kl": 0.61328125, "learning_rate": 2.1349072512647554e-07, "loss": 0.0006, "reward": 3.497292399406433, "reward_std": 0.04093513707630336, "rewards/final_reward": 1.8676119399339668, "rewards/mask_iou_reward": 0.9338059699669834, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.497292399406433, "rewards/thk_ans_format_reward": 1.0, "step": 2332, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 113.046875, "epoch": 3.93929173693086, "grad_norm": 18.09617095066498, "kl": 0.603515625, "learning_rate": 2.1315345699831366e-07, "loss": 0.0006, "reward": 3.1687710285186768, "reward_std": 0.0905944537371397, "rewards/final_reward": 1.2206195223726, "rewards/mask_iou_reward": 0.6103097611863, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1687710881233215, "rewards/thk_ans_format_reward": 1.0, "step": 2333, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 117.984375, "epoch": 3.9409780775716694, "grad_norm": 6.712108613719201, "kl": 0.52734375, "learning_rate": 2.1281618887015177e-07, "loss": 0.0005, "reward": 3.137063980102539, "reward_std": 0.03329486772418022, "rewards/final_reward": 1.110083896969302, "rewards/mask_iou_reward": 0.555041948484651, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1370639204978943, "rewards/thk_ans_format_reward": 1.0, "step": 2334, "think_completion_length": 42.96875 }, { "clip_ratio": 0.0, "completion_length": 126.65625, "epoch": 3.942664418212479, "grad_norm": 30.11300081634084, "kl": 0.48828125, "learning_rate": 2.1247892074198986e-07, "loss": 0.0004, "reward": 3.1940513849258423, "reward_std": 0.3399582654237747, "rewards/final_reward": 1.6859626967085002, "rewards/mask_iou_reward": 0.8429813483542501, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1940513253211975, "rewards/thk_ans_format_reward": 1.0, "step": 2335, "think_completion_length": 45.4375 }, { "clip_ratio": 0.0, "completion_length": 117.109375, "epoch": 3.9443507588532882, "grad_norm": 15.486041233379309, "kl": 0.55859375, "learning_rate": 2.12141652613828e-07, "loss": 0.0006, "reward": 3.3924100399017334, "reward_std": 0.06121925637125969, "rewards/final_reward": 1.5387883976333216, "rewards/mask_iou_reward": 0.7693941988166608, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3924100399017334, "rewards/thk_ans_format_reward": 1.0, "step": 2336, "think_completion_length": 52.59375 }, { "clip_ratio": 0.0, "completion_length": 112.53125, "epoch": 3.946037099494098, "grad_norm": 24.18681337716054, "kl": 0.5703125, "learning_rate": 2.1180438448566609e-07, "loss": 0.0006, "reward": 2.8781609535217285, "reward_std": 0.08533753454685211, "rewards/final_reward": 1.4222425338746256, "rewards/mask_iou_reward": 0.7111212669373128, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8781610429286957, "rewards/thk_ans_format_reward": 1.0, "step": 2337, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 125.765625, "epoch": 3.947723440134907, "grad_norm": 13.800023998033083, "kl": 0.79296875, "learning_rate": 2.114671163575042e-07, "loss": 0.0008, "reward": 3.460109233856201, "reward_std": 0.06012635678052902, "rewards/final_reward": 1.5778669843797775, "rewards/mask_iou_reward": 0.7889334921898887, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.460109293460846, "rewards/thk_ans_format_reward": 1.0, "step": 2338, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 117.265625, "epoch": 3.9494097807757167, "grad_norm": 7.30648588552527, "kl": 0.58984375, "learning_rate": 2.111298482293423e-07, "loss": 0.0006, "reward": 3.709952473640442, "reward_std": 0.01341787725687027, "rewards/final_reward": 1.9771459305174437, "rewards/mask_iou_reward": 0.9885729652587218, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7099525332450867, "rewards/thk_ans_format_reward": 1.0, "step": 2339, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 113.03125, "epoch": 3.9510961214165263, "grad_norm": 10.580802313832068, "kl": 0.70703125, "learning_rate": 2.1079258010118043e-07, "loss": 0.0007, "reward": 3.5735737085342407, "reward_std": 0.05377620831131935, "rewards/final_reward": 1.7095786501825816, "rewards/mask_iou_reward": 0.8547893250912908, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5735737085342407, "rewards/thk_ans_format_reward": 1.0, "step": 2340, "think_completion_length": 45.65625 }, { "clip_ratio": 0.0, "completion_length": 166.703125, "epoch": 3.9527824620573355, "grad_norm": 5.796140618878087, "kl": 0.453125, "learning_rate": 2.1045531197301854e-07, "loss": 0.0003, "reward": 2.8115952014923096, "reward_std": 0.059875136241316795, "rewards/final_reward": 1.033261149128388, "rewards/mask_iou_reward": 0.516630574564194, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.811595231294632, "rewards/thk_ans_format_reward": 1.0, "step": 2341, "think_completion_length": 44.65625 }, { "clip_ratio": 0.0, "completion_length": 120.609375, "epoch": 3.954468802698145, "grad_norm": 7.097882097442878, "kl": 0.541015625, "learning_rate": 2.1011804384485665e-07, "loss": 0.0005, "reward": 3.1771254539489746, "reward_std": 0.14247756265103817, "rewards/final_reward": 0.6251058696369434, "rewards/mask_iou_reward": 0.3125529348184717, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1771255135536194, "rewards/thk_ans_format_reward": 1.0, "step": 2342, "think_completion_length": 51.125 }, { "clip_ratio": 0.0, "completion_length": 115.484375, "epoch": 3.9561551433389543, "grad_norm": 9.763072265442423, "kl": 0.556640625, "learning_rate": 2.0978077571669474e-07, "loss": 0.0006, "reward": 3.623378276824951, "reward_std": 0.015420469455420971, "rewards/final_reward": 1.4593983815876321, "rewards/mask_iou_reward": 0.7296991907938161, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6233782768249512, "rewards/thk_ans_format_reward": 1.0, "step": 2343, "think_completion_length": 48.8125 }, { "clip_ratio": 0.0, "completion_length": 114.734375, "epoch": 3.957841483979764, "grad_norm": 12.010130402601112, "kl": 0.560546875, "learning_rate": 2.0944350758853288e-07, "loss": 0.0006, "reward": 3.293154716491699, "reward_std": 0.1442108228802681, "rewards/final_reward": 1.1585732875079058, "rewards/mask_iou_reward": 0.5792866437539529, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2931545972824097, "rewards/thk_ans_format_reward": 1.0, "step": 2344, "think_completion_length": 46.21875 }, { "clip_ratio": 0.0, "completion_length": 116.3125, "epoch": 3.959527824620573, "grad_norm": 19.952532610141517, "kl": 0.55078125, "learning_rate": 2.0910623946037097e-07, "loss": 0.0005, "reward": 3.4306150674819946, "reward_std": 0.19564368575811386, "rewards/final_reward": 1.5140829556181865, "rewards/mask_iou_reward": 0.7570414778090933, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4306151866912842, "rewards/thk_ans_format_reward": 1.0, "step": 2345, "think_completion_length": 45.625 }, { "clip_ratio": 0.0, "completion_length": 112.90625, "epoch": 3.961214165261383, "grad_norm": 6.506383412589879, "kl": 0.615234375, "learning_rate": 2.087689713322091e-07, "loss": 0.0006, "reward": 3.2742691040039062, "reward_std": 0.38428041338920593, "rewards/final_reward": 1.3280366253202556, "rewards/mask_iou_reward": 0.6640183126601278, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2742691040039062, "rewards/thk_ans_format_reward": 1.0, "step": 2346, "think_completion_length": 44.4375 }, { "clip_ratio": 0.0, "completion_length": 116.75, "epoch": 3.9629005059021924, "grad_norm": 12.84224251522735, "kl": 0.625, "learning_rate": 2.084317032040472e-07, "loss": 0.0006, "reward": 3.4609274864196777, "reward_std": 0.07086838409304619, "rewards/final_reward": 1.5369746382770562, "rewards/mask_iou_reward": 0.7684873191385281, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4609274864196777, "rewards/thk_ans_format_reward": 1.0, "step": 2347, "think_completion_length": 42.3125 }, { "clip_ratio": 0.0, "completion_length": 114.171875, "epoch": 3.9645868465430016, "grad_norm": 7.010148238756001, "kl": 0.541015625, "learning_rate": 2.080944350758853e-07, "loss": 0.0005, "reward": 3.171350598335266, "reward_std": 0.22759989090263844, "rewards/final_reward": 0.6685336459687008, "rewards/mask_iou_reward": 0.3342668229843504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.171350359916687, "rewards/thk_ans_format_reward": 1.0, "step": 2348, "think_completion_length": 45.5 }, { "clip_ratio": 0.0, "completion_length": 115.40625, "epoch": 3.9662731871838113, "grad_norm": 8.214608147697609, "kl": 0.548828125, "learning_rate": 2.0775716694772345e-07, "loss": 0.0005, "reward": 3.204972267150879, "reward_std": 0.11227181181311607, "rewards/final_reward": 0.797163704136042, "rewards/mask_iou_reward": 0.398581852068021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2049723267555237, "rewards/thk_ans_format_reward": 1.0, "step": 2349, "think_completion_length": 41.8125 }, { "clip_ratio": 0.0, "completion_length": 120.03125, "epoch": 3.9679595278246205, "grad_norm": 21.59843072432012, "kl": 0.58984375, "learning_rate": 2.0741989881956154e-07, "loss": 0.0006, "reward": 3.084683656692505, "reward_std": 0.12124911695718765, "rewards/final_reward": 1.2331499388594027, "rewards/mask_iou_reward": 0.6165749694297014, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.100308820605278, "rewards/thk_ans_format_reward": 0.984375, "step": 2350, "think_completion_length": 49.5625 }, { "clip_ratio": 0.0, "completion_length": 116.1875, "epoch": 3.96964586846543, "grad_norm": 33.7028917585473, "kl": 0.55859375, "learning_rate": 2.0708263069139965e-07, "loss": 0.0005, "reward": 3.4958006143569946, "reward_std": 0.12888287706300616, "rewards/final_reward": 1.1570881066591203, "rewards/mask_iou_reward": 0.5785440533295602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4958006739616394, "rewards/thk_ans_format_reward": 1.0, "step": 2351, "think_completion_length": 45.78125 }, { "clip_ratio": 0.0, "completion_length": 222.34375, "epoch": 3.9713322091062393, "grad_norm": 16.717874723781456, "kl": 0.5, "learning_rate": 2.0674536256323777e-07, "loss": 0.0005, "reward": 3.160848021507263, "reward_std": 0.3593662567436695, "rewards/final_reward": 1.1463538950898486, "rewards/mask_iou_reward": 0.5731769475449243, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.254598081111908, "rewards/thk_ans_format_reward": 0.953125, "step": 2352, "think_completion_length": 53.125 }, { "clip_ratio": 0.0, "completion_length": 140.109375, "epoch": 3.973018549747049, "grad_norm": 9.90686934332323, "kl": 0.53515625, "learning_rate": 2.0640809443507588e-07, "loss": 0.0005, "reward": 3.0223071575164795, "reward_std": 0.05929320678114891, "rewards/final_reward": 1.237988610147628, "rewards/mask_iou_reward": 0.618994305073814, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0223073363304138, "rewards/thk_ans_format_reward": 1.0, "step": 2353, "think_completion_length": 51.46875 }, { "clip_ratio": 0.0, "completion_length": 115.171875, "epoch": 3.9747048903878586, "grad_norm": 7.787093798034208, "kl": 0.578125, "learning_rate": 2.06070826306914e-07, "loss": 0.0006, "reward": 3.5822603702545166, "reward_std": 0.060061621479690075, "rewards/final_reward": 1.4014618811568327, "rewards/mask_iou_reward": 0.7007309405784163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5822604298591614, "rewards/thk_ans_format_reward": 1.0, "step": 2354, "think_completion_length": 47.0625 }, { "clip_ratio": 0.0, "completion_length": 178.734375, "epoch": 3.9763912310286678, "grad_norm": 36.662165992532785, "kl": 0.580078125, "learning_rate": 2.057335581787521e-07, "loss": 0.0006, "reward": 2.6705543994903564, "reward_std": 0.11838686466217041, "rewards/final_reward": 0.5942330853210306, "rewards/mask_iou_reward": 0.2971165426605153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.6705543994903564, "rewards/thk_ans_format_reward": 1.0, "step": 2355, "think_completion_length": 41.5625 }, { "clip_ratio": 0.0, "completion_length": 116.46875, "epoch": 3.9780775716694774, "grad_norm": 5.048215872579047, "kl": 0.53125, "learning_rate": 2.053962900505902e-07, "loss": 0.0005, "reward": 3.039967894554138, "reward_std": 0.16621370613574982, "rewards/final_reward": 0.9111258281079331, "rewards/mask_iou_reward": 0.45556291405396654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0399678200483322, "rewards/thk_ans_format_reward": 1.0, "step": 2356, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 120.296875, "epoch": 3.9797639123102866, "grad_norm": 5.680459626682978, "kl": 0.552734375, "learning_rate": 2.0505902192242834e-07, "loss": 0.0006, "reward": 3.610256791114807, "reward_std": 0.15282126516103745, "rewards/final_reward": 1.2514259912562973, "rewards/mask_iou_reward": 0.6257129956281486, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6102567911148071, "rewards/thk_ans_format_reward": 1.0, "step": 2357, "think_completion_length": 41.1875 }, { "clip_ratio": 0.0, "completion_length": 149.640625, "epoch": 3.9814502529510962, "grad_norm": 50.9202273375314, "kl": 0.490234375, "learning_rate": 2.0472175379426642e-07, "loss": 0.0005, "reward": 3.60845947265625, "reward_std": 0.14291292056441307, "rewards/final_reward": 1.3316400335536596, "rewards/mask_iou_reward": 0.6658200167768298, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6084596514701843, "rewards/thk_ans_format_reward": 1.0, "step": 2358, "think_completion_length": 41.90625 }, { "clip_ratio": 0.0, "completion_length": 113.625, "epoch": 3.9831365935919054, "grad_norm": 6.8048835926311355, "kl": 0.5703125, "learning_rate": 2.0438448566610456e-07, "loss": 0.0006, "reward": 3.695403814315796, "reward_std": 0.05196426110342145, "rewards/final_reward": 1.4792172508409012, "rewards/mask_iou_reward": 0.7396086254204506, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6954036355018616, "rewards/thk_ans_format_reward": 1.0, "step": 2359, "think_completion_length": 45.0625 }, { "clip_ratio": 0.0, "completion_length": 157.640625, "epoch": 3.984822934232715, "grad_norm": 7.436063431534893, "kl": 0.623046875, "learning_rate": 2.0404721753794265e-07, "loss": 0.0006, "reward": 3.3029123544692993, "reward_std": 0.19523370638489723, "rewards/final_reward": 1.8108300410136204, "rewards/mask_iou_reward": 0.9054150205068102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3029123842716217, "rewards/thk_ans_format_reward": 1.0, "step": 2360, "think_completion_length": 46.34375 }, { "clip_ratio": 0.0, "completion_length": 110.71875, "epoch": 3.9865092748735247, "grad_norm": 11.427629433570743, "kl": 0.6171875, "learning_rate": 2.0370994940978076e-07, "loss": 0.0006, "reward": 3.600027322769165, "reward_std": 0.035907904617488384, "rewards/final_reward": 1.8624177284520789, "rewards/mask_iou_reward": 0.9312088642260394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.600027322769165, "rewards/thk_ans_format_reward": 1.0, "step": 2361, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 112.015625, "epoch": 3.988195615514334, "grad_norm": 6.780449514230903, "kl": 0.640625, "learning_rate": 2.0337268128161888e-07, "loss": 0.0006, "reward": 3.4493885040283203, "reward_std": 0.05858028307557106, "rewards/final_reward": 1.458034595134662, "rewards/mask_iou_reward": 0.729017297567331, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4493885040283203, "rewards/thk_ans_format_reward": 1.0, "step": 2362, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 115.6875, "epoch": 3.989881956155143, "grad_norm": 8.232895015262699, "kl": 0.58203125, "learning_rate": 2.03035413153457e-07, "loss": 0.0006, "reward": 3.7412610054016113, "reward_std": 0.07890792051330209, "rewards/final_reward": 1.6300552338040415, "rewards/mask_iou_reward": 0.8150276169020207, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7412610054016113, "rewards/thk_ans_format_reward": 1.0, "step": 2363, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 111.28125, "epoch": 3.9915682967959527, "grad_norm": 13.791693880711723, "kl": 0.6015625, "learning_rate": 2.026981450252951e-07, "loss": 0.0006, "reward": 3.418661952018738, "reward_std": 0.0330036785453558, "rewards/final_reward": 1.1440329602519095, "rewards/mask_iou_reward": 0.5720164801259547, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4186618328094482, "rewards/thk_ans_format_reward": 1.0, "step": 2364, "think_completion_length": 39.71875 }, { "clip_ratio": 0.0, "completion_length": 114.0, "epoch": 3.9932546374367623, "grad_norm": 21.579353241651173, "kl": 0.6796875, "learning_rate": 2.0236087689713322e-07, "loss": 0.0006, "reward": 3.858848810195923, "reward_std": 0.016931952442973852, "rewards/final_reward": 1.8235267805660484, "rewards/mask_iou_reward": 0.9117633902830242, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8588489890098572, "rewards/thk_ans_format_reward": 1.0, "step": 2365, "think_completion_length": 44.96875 }, { "clip_ratio": 0.0, "completion_length": 151.375, "epoch": 3.9949409780775715, "grad_norm": 11.854854255019934, "kl": 0.509765625, "learning_rate": 2.020236087689713e-07, "loss": 0.0005, "reward": 3.8232831954956055, "reward_std": 0.032786943775136024, "rewards/final_reward": 1.728261051517324, "rewards/mask_iou_reward": 0.864130525758662, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8232831954956055, "rewards/thk_ans_format_reward": 1.0, "step": 2366, "think_completion_length": 46.8125 }, { "clip_ratio": 0.0, "completion_length": 112.5625, "epoch": 3.996627318718381, "grad_norm": 32.14007175303776, "kl": 0.57421875, "learning_rate": 2.0168634064080945e-07, "loss": 0.0006, "reward": 2.8483331203460693, "reward_std": 0.015060745645314455, "rewards/final_reward": 1.3362465005289894, "rewards/mask_iou_reward": 0.6681232502644947, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8483331203460693, "rewards/thk_ans_format_reward": 1.0, "step": 2367, "think_completion_length": 38.46875 }, { "clip_ratio": 0.0, "completion_length": 110.16666793823242, "epoch": 3.998313659359191, "grad_norm": 10.11380764879701, "kl": 0.751953125, "learning_rate": 2.0134907251264754e-07, "loss": 0.0007, "reward": 3.6402668952941895, "reward_std": 0.021844581700861454, "rewards/final_reward": 1.6423651190756967, "rewards/mask_iou_reward": 0.8211825595378484, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6402669548988342, "rewards/thk_ans_format_reward": 1.0, "step": 2368, "think_completion_length": 36.78125 }, { "clip_ratio": 0.0, "completion_length": 116.203125, "epoch": 4.001686340640809, "grad_norm": 11.480409829973361, "kl": 0.6015625, "learning_rate": 2.0101180438448565e-07, "loss": 0.0006, "reward": 3.5934234857559204, "reward_std": 0.0587493684142828, "rewards/final_reward": 1.948077071431705, "rewards/mask_iou_reward": 0.9740385357158525, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.59342360496521, "rewards/thk_ans_format_reward": 1.0, "step": 2369, "think_completion_length": 47.8125 }, { "clip_ratio": 0.0, "completion_length": 110.34375, "epoch": 4.003372681281619, "grad_norm": 8.238883165059963, "kl": 0.55859375, "learning_rate": 2.0067453625632376e-07, "loss": 0.0006, "reward": 3.468377709388733, "reward_std": 0.060116853564977646, "rewards/final_reward": 1.3180356249804, "rewards/mask_iou_reward": 0.6590178124902, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4683776497840881, "rewards/thk_ans_format_reward": 1.0, "step": 2370, "think_completion_length": 38.375 }, { "clip_ratio": 0.0, "completion_length": 112.515625, "epoch": 4.0050590219224285, "grad_norm": 12.92183070167942, "kl": 0.56640625, "learning_rate": 2.0033726812816188e-07, "loss": 0.0006, "reward": 3.150872826576233, "reward_std": 0.10184543719515204, "rewards/final_reward": 0.9877310882947764, "rewards/mask_iou_reward": 0.4938655441473882, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1508729159832, "rewards/thk_ans_format_reward": 1.0, "step": 2371, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 145.28125, "epoch": 4.006745362563238, "grad_norm": 6.22318773334404, "kl": 0.5615234375, "learning_rate": 2e-07, "loss": 0.0006, "reward": 3.3521891832351685, "reward_std": 0.06817868165671825, "rewards/final_reward": 1.4573340237162564, "rewards/mask_iou_reward": 0.7286670118581282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3521891832351685, "rewards/thk_ans_format_reward": 1.0, "step": 2372, "think_completion_length": 42.5625 }, { "clip_ratio": 0.0, "completion_length": 129.140625, "epoch": 4.008431703204047, "grad_norm": 6.22020827034915, "kl": 0.53515625, "learning_rate": 1.996627318718381e-07, "loss": 0.0005, "reward": 3.731974720954895, "reward_std": 0.0071526761166751385, "rewards/final_reward": 1.5606302272620618, "rewards/mask_iou_reward": 0.7803151136310309, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7319748997688293, "rewards/thk_ans_format_reward": 1.0, "step": 2373, "think_completion_length": 46.15625 }, { "clip_ratio": 0.0, "completion_length": 113.359375, "epoch": 4.010118043844857, "grad_norm": 15.966196109750083, "kl": 0.5546875, "learning_rate": 1.993254637436762e-07, "loss": 0.0005, "reward": 3.180173635482788, "reward_std": 0.2132977396249771, "rewards/final_reward": 1.401960101621973, "rewards/mask_iou_reward": 0.7009800508109865, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.195798546075821, "rewards/thk_ans_format_reward": 1.0, "step": 2374, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 120.828125, "epoch": 4.011804384485666, "grad_norm": 8.591836875153847, "kl": 0.578125, "learning_rate": 1.9898819561551433e-07, "loss": 0.0006, "reward": 3.532989501953125, "reward_std": 0.14260661602020264, "rewards/final_reward": 1.8065476158177485, "rewards/mask_iou_reward": 0.9032738079088742, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.532989501953125, "rewards/thk_ans_format_reward": 1.0, "step": 2375, "think_completion_length": 48.03125 }, { "clip_ratio": 0.0, "completion_length": 158.4375, "epoch": 4.013490725126475, "grad_norm": 6.403629080049531, "kl": 0.521484375, "learning_rate": 1.9865092748735242e-07, "loss": 0.0006, "reward": 2.971209168434143, "reward_std": 0.059162041172385216, "rewards/final_reward": 0.8289285008547014, "rewards/mask_iou_reward": 0.4144642504273507, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9712091982364655, "rewards/thk_ans_format_reward": 1.0, "step": 2376, "think_completion_length": 42.28125 }, { "clip_ratio": 0.0, "completion_length": 112.015625, "epoch": 4.015177065767285, "grad_norm": 6.753527939628968, "kl": 0.546875, "learning_rate": 1.9831365935919056e-07, "loss": 0.0006, "reward": 3.7322280406951904, "reward_std": 0.2169511088868603, "rewards/final_reward": 1.5525031955747912, "rewards/mask_iou_reward": 0.7762515977873956, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7322279810905457, "rewards/thk_ans_format_reward": 1.0, "step": 2377, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 112.359375, "epoch": 4.016863406408095, "grad_norm": 18.263143894412075, "kl": 0.611328125, "learning_rate": 1.9797639123102867e-07, "loss": 0.0006, "reward": 3.4608668088912964, "reward_std": 0.04524455638602376, "rewards/final_reward": 1.5373226020513886, "rewards/mask_iou_reward": 0.7686613010256943, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4608668088912964, "rewards/thk_ans_format_reward": 1.0, "step": 2378, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 114.28125, "epoch": 4.018549747048904, "grad_norm": 487.8030454095207, "kl": 54.775390625, "learning_rate": 1.9763912310286676e-07, "loss": 0.0547, "reward": 3.39677631855011, "reward_std": 0.017723735887557268, "rewards/final_reward": 0.8353646910599449, "rewards/mask_iou_reward": 0.41768234552997247, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3967764377593994, "rewards/thk_ans_format_reward": 1.0, "step": 2379, "think_completion_length": 48.03125 }, { "clip_ratio": 0.0, "completion_length": 109.328125, "epoch": 4.020236087689713, "grad_norm": 11.546597629218004, "kl": 0.63671875, "learning_rate": 1.973018549747049e-07, "loss": 0.0006, "reward": 3.2893166542053223, "reward_std": 0.20702505111694336, "rewards/final_reward": 1.1057080017261876, "rewards/mask_iou_reward": 0.5528540008630938, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2893165946006775, "rewards/thk_ans_format_reward": 1.0, "step": 2380, "think_completion_length": 35.46875 }, { "clip_ratio": 0.0, "completion_length": 113.5625, "epoch": 4.021922428330523, "grad_norm": 12.940481833874944, "kl": 0.591796875, "learning_rate": 1.96964586846543e-07, "loss": 0.0006, "reward": 3.1833547353744507, "reward_std": 0.01602939050644636, "rewards/final_reward": 0.7768981750521698, "rewards/mask_iou_reward": 0.3884490875260849, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1833546161651611, "rewards/thk_ans_format_reward": 1.0, "step": 2381, "think_completion_length": 42.78125 }, { "clip_ratio": 0.0, "completion_length": 116.640625, "epoch": 4.023608768971332, "grad_norm": 37.607564987993854, "kl": 0.578125, "learning_rate": 1.966273187183811e-07, "loss": 0.0006, "reward": 3.6645880937576294, "reward_std": 0.05637491028755903, "rewards/final_reward": 1.4050240383685577, "rewards/mask_iou_reward": 0.7025120191842789, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.664588212966919, "rewards/thk_ans_format_reward": 1.0, "step": 2382, "think_completion_length": 41.21875 }, { "clip_ratio": 0.0, "completion_length": 113.28125, "epoch": 4.025295109612141, "grad_norm": 11.439961980420474, "kl": 0.548828125, "learning_rate": 1.9629005059021922e-07, "loss": 0.0005, "reward": 3.040262460708618, "reward_std": 0.0750212837010622, "rewards/final_reward": 0.9258982103314888, "rewards/mask_iou_reward": 0.4629491051657444, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0402624607086182, "rewards/thk_ans_format_reward": 1.0, "step": 2383, "think_completion_length": 44.46875 }, { "clip_ratio": 0.0, "completion_length": 143.890625, "epoch": 4.0269814502529515, "grad_norm": 4.516035931232741, "kl": 0.48828125, "learning_rate": 1.9595278246205733e-07, "loss": 0.0005, "reward": 3.7392314672470093, "reward_std": 0.1627311073243618, "rewards/final_reward": 1.717044571434485, "rewards/mask_iou_reward": 0.8585222857172425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7392314672470093, "rewards/thk_ans_format_reward": 1.0, "step": 2384, "think_completion_length": 44.375 }, { "clip_ratio": 0.0, "completion_length": 112.90625, "epoch": 4.028667790893761, "grad_norm": 19.841883618522072, "kl": 0.568359375, "learning_rate": 1.9561551433389544e-07, "loss": 0.0006, "reward": 2.8191603422164917, "reward_std": 0.041978662833571434, "rewards/final_reward": 0.7025187247498247, "rewards/mask_iou_reward": 0.35125936237491234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8191602826118469, "rewards/thk_ans_format_reward": 1.0, "step": 2385, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 113.359375, "epoch": 4.03035413153457, "grad_norm": 7.039285605112933, "kl": 0.58984375, "learning_rate": 1.9527824620573356e-07, "loss": 0.0006, "reward": 3.5875240564346313, "reward_std": 0.3093913681805134, "rewards/final_reward": 1.5723538136182496, "rewards/mask_iou_reward": 0.7861769068091248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.587524175643921, "rewards/thk_ans_format_reward": 1.0, "step": 2386, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 166.90625, "epoch": 4.032040472175379, "grad_norm": 12.816288500352174, "kl": 0.5859375, "learning_rate": 1.9494097807757165e-07, "loss": 0.0006, "reward": 2.8287243843078613, "reward_std": 0.5208190828561783, "rewards/final_reward": 0.7635315952793571, "rewards/mask_iou_reward": 0.38176579763967855, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.8912242650985718, "rewards/thk_ans_format_reward": 0.96875, "step": 2387, "think_completion_length": 40.8125 }, { "clip_ratio": 0.0, "completion_length": 114.328125, "epoch": 4.033726812816189, "grad_norm": 25.36677976502114, "kl": 0.53125, "learning_rate": 1.9460370994940979e-07, "loss": 0.0005, "reward": 3.5308085680007935, "reward_std": 0.060908347368240356, "rewards/final_reward": 1.7344693208015038, "rewards/mask_iou_reward": 0.8672346604007519, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5308085680007935, "rewards/thk_ans_format_reward": 1.0, "step": 2388, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 113.6875, "epoch": 4.035413153456998, "grad_norm": 14.45270041695548, "kl": 0.90234375, "learning_rate": 1.9426644182124787e-07, "loss": 0.0009, "reward": 3.292656898498535, "reward_std": 0.13113961927592754, "rewards/final_reward": 1.2586338570953175, "rewards/mask_iou_reward": 0.6293169285476587, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2926568984985352, "rewards/thk_ans_format_reward": 1.0, "step": 2389, "think_completion_length": 46.375 }, { "clip_ratio": 0.0, "completion_length": 125.484375, "epoch": 4.0370994940978076, "grad_norm": 7.587921954479646, "kl": 0.5048828125, "learning_rate": 1.9392917369308601e-07, "loss": 0.0005, "reward": 3.715697169303894, "reward_std": 0.11376471444964409, "rewards/final_reward": 1.812208194310015, "rewards/mask_iou_reward": 0.9061040971550075, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7156970500946045, "rewards/thk_ans_format_reward": 1.0, "step": 2390, "think_completion_length": 36.96875 }, { "clip_ratio": 0.0, "completion_length": 113.734375, "epoch": 4.038785834738618, "grad_norm": 26.996972484909367, "kl": 0.55859375, "learning_rate": 1.935919055649241e-07, "loss": 0.0006, "reward": 3.1315231323242188, "reward_std": 0.1370735252276063, "rewards/final_reward": 0.8673822408615481, "rewards/mask_iou_reward": 0.43369112043077407, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.131523072719574, "rewards/thk_ans_format_reward": 1.0, "step": 2391, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 148.59375, "epoch": 4.040472175379427, "grad_norm": 7.6753970617393, "kl": 0.544921875, "learning_rate": 1.9325463743676222e-07, "loss": 0.0005, "reward": 3.4671186208724976, "reward_std": 0.04770086891949177, "rewards/final_reward": 1.8936233515745924, "rewards/mask_iou_reward": 0.9468116757872962, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4671186208724976, "rewards/thk_ans_format_reward": 1.0, "step": 2392, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 111.4375, "epoch": 4.042158516020236, "grad_norm": 8.931034431762196, "kl": 0.578125, "learning_rate": 1.9291736930860033e-07, "loss": 0.0006, "reward": 3.7695319652557373, "reward_std": 0.21818761248141527, "rewards/final_reward": 1.7540229682748367, "rewards/mask_iou_reward": 0.8770114841374184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7695319652557373, "rewards/thk_ans_format_reward": 1.0, "step": 2393, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 151.484375, "epoch": 4.043844856661045, "grad_norm": 12.924364201978555, "kl": 1.615234375, "learning_rate": 1.9258010118043844e-07, "loss": 0.0016, "reward": 3.437775492668152, "reward_std": 0.25223083049058914, "rewards/final_reward": 1.2896175495769646, "rewards/mask_iou_reward": 0.6448087747884823, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.437775433063507, "rewards/thk_ans_format_reward": 1.0, "step": 2394, "think_completion_length": 42.78125 }, { "clip_ratio": 0.0, "completion_length": 112.640625, "epoch": 4.045531197301855, "grad_norm": 23.626398444759594, "kl": 0.56640625, "learning_rate": 1.9224283305227653e-07, "loss": 0.0006, "reward": 3.8039190769195557, "reward_std": 0.00993341370485723, "rewards/final_reward": 1.8160529556955798, "rewards/mask_iou_reward": 0.9080264778477899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8039190769195557, "rewards/thk_ans_format_reward": 1.0, "step": 2395, "think_completion_length": 39.53125 }, { "clip_ratio": 0.0, "completion_length": 109.609375, "epoch": 4.0472175379426645, "grad_norm": 10.156897671568798, "kl": 0.55859375, "learning_rate": 1.9190556492411467e-07, "loss": 0.0006, "reward": 3.070542812347412, "reward_std": 0.06726253964006901, "rewards/final_reward": 0.5538109703120524, "rewards/mask_iou_reward": 0.2769054851560262, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0705427527427673, "rewards/thk_ans_format_reward": 1.0, "step": 2396, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 136.109375, "epoch": 4.048903878583474, "grad_norm": 5.564434290948505, "kl": 0.623046875, "learning_rate": 1.9156829679595276e-07, "loss": 0.0006, "reward": 3.2643067836761475, "reward_std": 0.1904737390577793, "rewards/final_reward": 1.0842811559956482, "rewards/mask_iou_reward": 0.5421405779978241, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2643068432807922, "rewards/thk_ans_format_reward": 1.0, "step": 2397, "think_completion_length": 40.875 }, { "clip_ratio": 0.0, "completion_length": 111.140625, "epoch": 4.050590219224283, "grad_norm": 6.736417126637982, "kl": 0.5703125, "learning_rate": 1.912310286677909e-07, "loss": 0.0006, "reward": 3.818103551864624, "reward_std": 0.015501199522987008, "rewards/final_reward": 1.8088766443374351, "rewards/mask_iou_reward": 0.9044383221687176, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8181034922599792, "rewards/thk_ans_format_reward": 1.0, "step": 2398, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 200.953125, "epoch": 4.052276559865093, "grad_norm": 9.671563346532743, "kl": 0.50390625, "learning_rate": 1.9089376053962899e-07, "loss": 0.0005, "reward": 3.338501453399658, "reward_std": 0.28296706080436707, "rewards/final_reward": 1.1677833458807307, "rewards/mask_iou_reward": 0.5838916729403654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3385014832019806, "rewards/thk_ans_format_reward": 1.0, "step": 2399, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 128.359375, "epoch": 4.053962900505902, "grad_norm": 12.746302672892734, "kl": 0.576171875, "learning_rate": 1.905564924114671e-07, "loss": 0.0006, "reward": 3.299188733100891, "reward_std": 0.3755532205104828, "rewards/final_reward": 1.1859492626462202, "rewards/mask_iou_reward": 0.5929746313231101, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2991888523101807, "rewards/thk_ans_format_reward": 1.0, "step": 2400, "think_completion_length": 41.34375 }, { "clip_ratio": 0.0, "completion_length": 184.46875, "epoch": 4.055649241146711, "grad_norm": 3.9789913504043333, "kl": 0.49609375, "learning_rate": 1.9021922428330521e-07, "loss": 0.0005, "reward": 3.257691979408264, "reward_std": 0.27566526364535093, "rewards/final_reward": 1.202514783245717, "rewards/mask_iou_reward": 0.6012573916228585, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.288942039012909, "rewards/thk_ans_format_reward": 0.984375, "step": 2401, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 112.328125, "epoch": 4.057335581787521, "grad_norm": 6.355089159537488, "kl": 0.591796875, "learning_rate": 1.8988195615514333e-07, "loss": 0.0006, "reward": 3.548841118812561, "reward_std": 0.12714591436088085, "rewards/final_reward": 1.359696598663631, "rewards/mask_iou_reward": 0.6798482993318155, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5488411784172058, "rewards/thk_ans_format_reward": 1.0, "step": 2402, "think_completion_length": 36.90625 }, { "clip_ratio": 0.0, "completion_length": 107.796875, "epoch": 4.059021922428331, "grad_norm": 26.422139998000524, "kl": 0.62890625, "learning_rate": 1.8954468802698144e-07, "loss": 0.0006, "reward": 3.0529425144195557, "reward_std": 0.14811351895332336, "rewards/final_reward": 1.177692374918326, "rewards/mask_iou_reward": 0.588846187459163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0529426038265228, "rewards/thk_ans_format_reward": 1.0, "step": 2403, "think_completion_length": 39.125 }, { "clip_ratio": 0.0, "completion_length": 114.140625, "epoch": 4.06070826306914, "grad_norm": 40.14506228897134, "kl": 0.58984375, "learning_rate": 1.8920741989881955e-07, "loss": 0.0006, "reward": 3.2226529121398926, "reward_std": 0.04350670985877514, "rewards/final_reward": 1.335124546034519, "rewards/mask_iou_reward": 0.6675622730172595, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2226530611515045, "rewards/thk_ans_format_reward": 1.0, "step": 2404, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 148.890625, "epoch": 4.062394603709949, "grad_norm": 61.836424536738996, "kl": 0.609375, "learning_rate": 1.8887015177065764e-07, "loss": 0.0006, "reward": 3.7547478675842285, "reward_std": 0.054970819503068924, "rewards/final_reward": 1.634220214433795, "rewards/mask_iou_reward": 0.8171101072168975, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7547478675842285, "rewards/thk_ans_format_reward": 1.0, "step": 2405, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 110.671875, "epoch": 4.064080944350759, "grad_norm": 8.199856621555274, "kl": 0.572265625, "learning_rate": 1.8853288364249578e-07, "loss": 0.0006, "reward": 3.1624104976654053, "reward_std": 0.19921143352985382, "rewards/final_reward": 1.2621524469381291, "rewards/mask_iou_reward": 0.6310762234690646, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.16241055727005, "rewards/thk_ans_format_reward": 1.0, "step": 2406, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 110.796875, "epoch": 4.065767284991568, "grad_norm": 9.440365557766706, "kl": 0.548828125, "learning_rate": 1.8819561551433387e-07, "loss": 0.0006, "reward": 3.2724127769470215, "reward_std": 0.03121477458626032, "rewards/final_reward": 1.23542375517504, "rewards/mask_iou_reward": 0.61771187758752, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2724127173423767, "rewards/thk_ans_format_reward": 1.0, "step": 2407, "think_completion_length": 41.25 }, { "clip_ratio": 0.0, "completion_length": 112.078125, "epoch": 4.0674536256323774, "grad_norm": 27.360006415004754, "kl": 0.572265625, "learning_rate": 1.87858347386172e-07, "loss": 0.0006, "reward": 3.480781078338623, "reward_std": 0.05732197128236294, "rewards/final_reward": 1.324549316311801, "rewards/mask_iou_reward": 0.6622746581559005, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.480781078338623, "rewards/thk_ans_format_reward": 1.0, "step": 2408, "think_completion_length": 42.84375 }, { "clip_ratio": 0.0, "completion_length": 161.078125, "epoch": 4.0691399662731875, "grad_norm": 14.195257828449042, "kl": 0.630859375, "learning_rate": 1.8752107925801012e-07, "loss": 0.0006, "reward": 3.0159380435943604, "reward_std": 0.11699309200048447, "rewards/final_reward": 0.8030806539537556, "rewards/mask_iou_reward": 0.4015403269768778, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0315630435943604, "rewards/thk_ans_format_reward": 0.984375, "step": 2409, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 115.28125, "epoch": 4.070826306913997, "grad_norm": 9.780196344080494, "kl": 0.552734375, "learning_rate": 1.871838111298482e-07, "loss": 0.0006, "reward": 3.4170422554016113, "reward_std": 0.10918816924095154, "rewards/final_reward": 1.6272089634097822, "rewards/mask_iou_reward": 0.8136044817048911, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4170424938201904, "rewards/thk_ans_format_reward": 1.0, "step": 2410, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 109.0, "epoch": 4.072512647554806, "grad_norm": 10.678234098227074, "kl": 0.61328125, "learning_rate": 1.8684654300168635e-07, "loss": 0.0006, "reward": 3.7279247045516968, "reward_std": 0.09347648662514985, "rewards/final_reward": 1.619307008372024, "rewards/mask_iou_reward": 0.809653504186012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7279247045516968, "rewards/thk_ans_format_reward": 1.0, "step": 2411, "think_completion_length": 39.3125 }, { "clip_ratio": 0.0, "completion_length": 111.609375, "epoch": 4.074198988195615, "grad_norm": 10.855980004734976, "kl": 0.59375, "learning_rate": 1.8650927487352444e-07, "loss": 0.0006, "reward": 3.362097144126892, "reward_std": 0.1303092995658517, "rewards/final_reward": 1.6823154227344528, "rewards/mask_iou_reward": 0.8411577113672264, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3620972633361816, "rewards/thk_ans_format_reward": 1.0, "step": 2412, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 112.109375, "epoch": 4.075885328836425, "grad_norm": 15.421168686239545, "kl": 0.58984375, "learning_rate": 1.8617200674536255e-07, "loss": 0.0006, "reward": 3.3645206689834595, "reward_std": 0.248811274766922, "rewards/final_reward": 1.2371536458376557, "rewards/mask_iou_reward": 0.6185768229188279, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3645207285881042, "rewards/thk_ans_format_reward": 1.0, "step": 2413, "think_completion_length": 42.0625 }, { "clip_ratio": 0.0, "completion_length": 124.890625, "epoch": 4.077571669477234, "grad_norm": 25.17182493148204, "kl": 0.529296875, "learning_rate": 1.8583473861720067e-07, "loss": 0.0006, "reward": 3.806470274925232, "reward_std": 0.0033699345076456666, "rewards/final_reward": 1.8741425356650039, "rewards/mask_iou_reward": 0.9370712678325019, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8064703345298767, "rewards/thk_ans_format_reward": 1.0, "step": 2414, "think_completion_length": 42.75 }, { "clip_ratio": 0.0, "completion_length": 112.84375, "epoch": 4.079258010118044, "grad_norm": 10.074641437628669, "kl": 0.5703125, "learning_rate": 1.8549747048903878e-07, "loss": 0.0006, "reward": 3.4285424947738647, "reward_std": 0.02089158445596695, "rewards/final_reward": 1.782670384576963, "rewards/mask_iou_reward": 0.8913351922884815, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4285423755645752, "rewards/thk_ans_format_reward": 1.0, "step": 2415, "think_completion_length": 43.3125 }, { "clip_ratio": 0.0, "completion_length": 141.65625, "epoch": 4.080944350758854, "grad_norm": 9.826004226753085, "kl": 0.580078125, "learning_rate": 1.851602023608769e-07, "loss": 0.0006, "reward": 3.0336287021636963, "reward_std": 0.09551212098449469, "rewards/final_reward": 0.7605734952695639, "rewards/mask_iou_reward": 0.38028674763478193, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0336288213729858, "rewards/thk_ans_format_reward": 1.0, "step": 2416, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 134.921875, "epoch": 4.082630691399663, "grad_norm": 6.860726849627503, "kl": 0.552734375, "learning_rate": 1.84822934232715e-07, "loss": 0.0006, "reward": 3.3313989639282227, "reward_std": 0.06857777805998921, "rewards/final_reward": 1.7899774872625613, "rewards/mask_iou_reward": 0.8949887436312807, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.331398993730545, "rewards/thk_ans_format_reward": 1.0, "step": 2417, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 211.484375, "epoch": 4.084317032040472, "grad_norm": 12.505909690671647, "kl": 0.533203125, "learning_rate": 1.844856661045531e-07, "loss": 0.0005, "reward": 3.3965485095977783, "reward_std": 0.3861938640475273, "rewards/final_reward": 1.1030617092141284, "rewards/mask_iou_reward": 0.5515308546070642, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.505923569202423, "rewards/thk_ans_format_reward": 0.9375, "step": 2418, "think_completion_length": 46.6875 }, { "clip_ratio": 0.0, "completion_length": 112.765625, "epoch": 4.086003372681281, "grad_norm": 7.129779888498348, "kl": 0.625, "learning_rate": 1.8414839797639124e-07, "loss": 0.0006, "reward": 3.7287741899490356, "reward_std": 0.2574765458703041, "rewards/final_reward": 1.6428350439463504, "rewards/mask_iou_reward": 0.8214175219731752, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7287741303443909, "rewards/thk_ans_format_reward": 1.0, "step": 2419, "think_completion_length": 41.90625 }, { "clip_ratio": 0.0, "completion_length": 116.328125, "epoch": 4.087689713322091, "grad_norm": 9.125420264391623, "kl": 0.546875, "learning_rate": 1.8381112984822932e-07, "loss": 0.0005, "reward": 3.340920567512512, "reward_std": 0.35566695034503937, "rewards/final_reward": 0.8635012586914437, "rewards/mask_iou_reward": 0.43175062934572184, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3409205675125122, "rewards/thk_ans_format_reward": 1.0, "step": 2420, "think_completion_length": 41.3125 }, { "clip_ratio": 0.0, "completion_length": 146.25, "epoch": 4.0893760539629005, "grad_norm": 8.219293003863777, "kl": 0.603515625, "learning_rate": 1.8347386172006746e-07, "loss": 0.0006, "reward": 2.8761707544326782, "reward_std": 0.06319956108927727, "rewards/final_reward": 0.8135975983025594, "rewards/mask_iou_reward": 0.4067987991512797, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8761707842350006, "rewards/thk_ans_format_reward": 1.0, "step": 2421, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 125.859375, "epoch": 4.09106239460371, "grad_norm": 11.65638932013057, "kl": 0.544921875, "learning_rate": 1.8313659359190555e-07, "loss": 0.0005, "reward": 3.4241864681243896, "reward_std": 0.13773788139224052, "rewards/final_reward": 1.3498022886924497, "rewards/mask_iou_reward": 0.6749011443462248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4241865277290344, "rewards/thk_ans_format_reward": 1.0, "step": 2422, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 123.6875, "epoch": 4.09274873524452, "grad_norm": 11.906764558427954, "kl": 0.60546875, "learning_rate": 1.8279932546374367e-07, "loss": 0.0006, "reward": 3.194682478904724, "reward_std": 0.2606370970606804, "rewards/final_reward": 1.521790364090617, "rewards/mask_iou_reward": 0.7608951820453085, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1946824789047241, "rewards/thk_ans_format_reward": 1.0, "step": 2423, "think_completion_length": 43.84375 }, { "clip_ratio": 0.0, "completion_length": 116.421875, "epoch": 4.094435075885329, "grad_norm": 11.78903892087091, "kl": 0.541015625, "learning_rate": 1.8246205733558178e-07, "loss": 0.0005, "reward": 3.5103652477264404, "reward_std": 0.00921852933242917, "rewards/final_reward": 1.670252714871993, "rewards/mask_iou_reward": 0.8351263574359965, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.51036536693573, "rewards/thk_ans_format_reward": 1.0, "step": 2424, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 108.828125, "epoch": 4.096121416526138, "grad_norm": 17.37244880870618, "kl": 0.796875, "learning_rate": 1.821247892074199e-07, "loss": 0.0008, "reward": 3.470105290412903, "reward_std": 0.09997964650392532, "rewards/final_reward": 1.7580551283626624, "rewards/mask_iou_reward": 0.8790275641813312, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4701053500175476, "rewards/thk_ans_format_reward": 1.0, "step": 2425, "think_completion_length": 38.28125 }, { "clip_ratio": 0.0, "completion_length": 151.34375, "epoch": 4.097807757166947, "grad_norm": 148.65573085337613, "kl": 0.609375, "learning_rate": 1.8178752107925798e-07, "loss": 0.0006, "reward": 2.7674933671951294, "reward_std": 0.30003902316093445, "rewards/final_reward": 0.5220741981410059, "rewards/mask_iou_reward": 0.26103709907050293, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 0.8299932479858398, "rewards/thk_ans_format_reward": 0.96875, "step": 2426, "think_completion_length": 38.21875 }, { "clip_ratio": 0.0, "completion_length": 114.5625, "epoch": 4.099494097807757, "grad_norm": 5.518980680485829, "kl": 0.564453125, "learning_rate": 1.8145025295109612e-07, "loss": 0.0006, "reward": 3.371211051940918, "reward_std": 0.15353204309940338, "rewards/final_reward": 1.6059161935315704, "rewards/mask_iou_reward": 0.8029580967657852, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3712111115455627, "rewards/thk_ans_format_reward": 1.0, "step": 2427, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 112.140625, "epoch": 4.101180438448567, "grad_norm": 8.035275994452993, "kl": 0.50390625, "learning_rate": 1.811129848229342e-07, "loss": 0.0005, "reward": 3.208521842956543, "reward_std": 0.09160394221544266, "rewards/final_reward": 1.791497057160064, "rewards/mask_iou_reward": 0.895748528580032, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.208521842956543, "rewards/thk_ans_format_reward": 1.0, "step": 2428, "think_completion_length": 41.25 }, { "clip_ratio": 0.0, "completion_length": 98.28125, "epoch": 4.102866779089376, "grad_norm": 7.516752843962914, "kl": 0.541015625, "learning_rate": 1.8077571669477235e-07, "loss": 0.0005, "reward": 3.7405242919921875, "reward_std": 0.03224869258701801, "rewards/final_reward": 1.962668194004178, "rewards/mask_iou_reward": 0.981334097002089, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7405242919921875, "rewards/thk_ans_format_reward": 1.0, "step": 2429, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 115.1875, "epoch": 4.104553119730186, "grad_norm": 44.142514705287155, "kl": 0.81640625, "learning_rate": 1.8043844856661044e-07, "loss": 0.0008, "reward": 3.570330858230591, "reward_std": 0.18656124360859394, "rewards/final_reward": 1.5419715342322102, "rewards/mask_iou_reward": 0.7709857671161051, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.570330560207367, "rewards/thk_ans_format_reward": 1.0, "step": 2430, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 110.5625, "epoch": 4.106239460370995, "grad_norm": 12.125870111288556, "kl": 0.568359375, "learning_rate": 1.8010118043844855e-07, "loss": 0.0006, "reward": 3.7397871017456055, "reward_std": 0.06458837911486626, "rewards/final_reward": 1.7609626170576957, "rewards/mask_iou_reward": 0.8804813085288479, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.739786982536316, "rewards/thk_ans_format_reward": 1.0, "step": 2431, "think_completion_length": 36.28125 }, { "clip_ratio": 0.0, "completion_length": 113.71875, "epoch": 4.107925801011804, "grad_norm": 9.271025189840522, "kl": 0.59765625, "learning_rate": 1.7976391231028666e-07, "loss": 0.0006, "reward": 3.296180844306946, "reward_std": 0.10032219812273979, "rewards/final_reward": 0.969565839219187, "rewards/mask_iou_reward": 0.4847829196095935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.296180784702301, "rewards/thk_ans_format_reward": 1.0, "step": 2432, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 113.671875, "epoch": 4.1096121416526135, "grad_norm": 22.015537297219275, "kl": 0.560546875, "learning_rate": 1.7942664418212478e-07, "loss": 0.0006, "reward": 3.7425882816314697, "reward_std": 0.11254134774208069, "rewards/final_reward": 1.6666927241671812, "rewards/mask_iou_reward": 0.8333463620835906, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7425881624221802, "rewards/thk_ans_format_reward": 1.0, "step": 2433, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 111.46875, "epoch": 4.1112984822934235, "grad_norm": 10.396776683202178, "kl": 0.54296875, "learning_rate": 1.790893760539629e-07, "loss": 0.0006, "reward": 3.3835560083389282, "reward_std": 0.17423714324831963, "rewards/final_reward": 1.2915921295781196, "rewards/mask_iou_reward": 0.6457960647890598, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3835560083389282, "rewards/thk_ans_format_reward": 1.0, "step": 2434, "think_completion_length": 41.125 }, { "clip_ratio": 0.0, "completion_length": 115.375, "epoch": 4.112984822934233, "grad_norm": 7.691601405960013, "kl": 0.623046875, "learning_rate": 1.78752107925801e-07, "loss": 0.0006, "reward": 3.6533315181732178, "reward_std": 0.07725580548867583, "rewards/final_reward": 1.5506914329713433, "rewards/mask_iou_reward": 0.7753457164856716, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.653331458568573, "rewards/thk_ans_format_reward": 1.0, "step": 2435, "think_completion_length": 47.78125 }, { "clip_ratio": 0.0, "completion_length": 127.859375, "epoch": 4.114671163575042, "grad_norm": 4.890209137735515, "kl": 0.525390625, "learning_rate": 1.784148397976391e-07, "loss": 0.0005, "reward": 2.8774254322052, "reward_std": 0.21360297221690416, "rewards/final_reward": 1.0372495292692403, "rewards/mask_iou_reward": 0.5186247646346202, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8774254620075226, "rewards/thk_ans_format_reward": 1.0, "step": 2436, "think_completion_length": 46.09375 }, { "clip_ratio": 0.0, "completion_length": 123.953125, "epoch": 4.116357504215852, "grad_norm": 6.788718135979743, "kl": 0.5703125, "learning_rate": 1.7807757166947723e-07, "loss": 0.0006, "reward": 3.4717376232147217, "reward_std": 0.042540392372757196, "rewards/final_reward": 1.7633789684762051, "rewards/mask_iou_reward": 0.8816894842381026, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4717376828193665, "rewards/thk_ans_format_reward": 1.0, "step": 2437, "think_completion_length": 40.25 }, { "clip_ratio": 0.0, "completion_length": 113.796875, "epoch": 4.118043844856661, "grad_norm": 15.210072320250731, "kl": 0.646484375, "learning_rate": 1.7774030354131535e-07, "loss": 0.0006, "reward": 3.538661479949951, "reward_std": 0.12559128180146217, "rewards/final_reward": 1.2970506437435008, "rewards/mask_iou_reward": 0.6485253218717504, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5386614799499512, "rewards/thk_ans_format_reward": 1.0, "step": 2438, "think_completion_length": 44.71875 }, { "clip_ratio": 0.0, "completion_length": 127.296875, "epoch": 4.11973018549747, "grad_norm": 12.104083484914264, "kl": 0.541015625, "learning_rate": 1.7740303541315346e-07, "loss": 0.0005, "reward": 3.2933363914489746, "reward_std": 0.0966414324939251, "rewards/final_reward": 1.6069715814968468, "rewards/mask_iou_reward": 0.8034857907484234, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2933364510536194, "rewards/thk_ans_format_reward": 1.0, "step": 2439, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 115.515625, "epoch": 4.12141652613828, "grad_norm": 5.781855397089921, "kl": 0.55078125, "learning_rate": 1.7706576728499157e-07, "loss": 0.0005, "reward": 3.274218440055847, "reward_std": 0.04520893655717373, "rewards/final_reward": 1.4331149297308339, "rewards/mask_iou_reward": 0.7165574648654169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.27421835064888, "rewards/thk_ans_format_reward": 1.0, "step": 2440, "think_completion_length": 41.71875 }, { "clip_ratio": 0.0, "completion_length": 110.59375, "epoch": 4.12310286677909, "grad_norm": 8.760429862503628, "kl": 0.5859375, "learning_rate": 1.7672849915682966e-07, "loss": 0.0006, "reward": 3.4449740648269653, "reward_std": 0.18864751234650612, "rewards/final_reward": 1.3188366043049522, "rewards/mask_iou_reward": 0.6594183021524761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4449740648269653, "rewards/thk_ans_format_reward": 1.0, "step": 2441, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 108.515625, "epoch": 4.124789207419899, "grad_norm": 8.159581656181548, "kl": 0.5859375, "learning_rate": 1.763912310286678e-07, "loss": 0.0006, "reward": 3.2669503688812256, "reward_std": 0.11719108745455742, "rewards/final_reward": 1.5414467634716296, "rewards/mask_iou_reward": 0.7707233817358148, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2825754284858704, "rewards/thk_ans_format_reward": 0.984375, "step": 2442, "think_completion_length": 38.5 }, { "clip_ratio": 0.0, "completion_length": 113.375, "epoch": 4.126475548060708, "grad_norm": 6.441309063839323, "kl": 0.580078125, "learning_rate": 1.760539629005059e-07, "loss": 0.0006, "reward": 3.6620864868164062, "reward_std": 0.04250127053819597, "rewards/final_reward": 1.734774837092839, "rewards/mask_iou_reward": 0.8673874185464195, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6620864272117615, "rewards/thk_ans_format_reward": 1.0, "step": 2443, "think_completion_length": 41.5 }, { "clip_ratio": 0.0, "completion_length": 158.515625, "epoch": 4.128161888701518, "grad_norm": 5.310654249140433, "kl": 0.55078125, "learning_rate": 1.75716694772344e-07, "loss": 0.0006, "reward": 3.5324217081069946, "reward_std": 0.028282339684665203, "rewards/final_reward": 1.4161529472944359, "rewards/mask_iou_reward": 0.7080764736472179, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5324216485023499, "rewards/thk_ans_format_reward": 1.0, "step": 2444, "think_completion_length": 40.4375 }, { "clip_ratio": 0.0, "completion_length": 119.390625, "epoch": 4.129848229342327, "grad_norm": 14.968375346591747, "kl": 0.56640625, "learning_rate": 1.7537942664418212e-07, "loss": 0.0006, "reward": 3.473724842071533, "reward_std": 0.019786870572715998, "rewards/final_reward": 1.9477039381400956, "rewards/mask_iou_reward": 0.9738519690700478, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.473724901676178, "rewards/thk_ans_format_reward": 1.0, "step": 2445, "think_completion_length": 45.875 }, { "clip_ratio": 0.0, "completion_length": 97.28125, "epoch": 4.1315345699831365, "grad_norm": 11.14283779346565, "kl": 0.58984375, "learning_rate": 1.7504215851602023e-07, "loss": 0.0006, "reward": 3.430332660675049, "reward_std": 0.17202283442020416, "rewards/final_reward": 0.973097065771472, "rewards/mask_iou_reward": 0.486548532885736, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4303327202796936, "rewards/thk_ans_format_reward": 1.0, "step": 2446, "think_completion_length": 38.6875 }, { "clip_ratio": 0.0, "completion_length": 127.90625, "epoch": 4.133220910623946, "grad_norm": 10.88582244855959, "kl": 0.56640625, "learning_rate": 1.7470489038785835e-07, "loss": 0.0006, "reward": 3.647831082344055, "reward_std": 0.0973800290375948, "rewards/final_reward": 1.5795811030044342, "rewards/mask_iou_reward": 0.7897905515022171, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6478310227394104, "rewards/thk_ans_format_reward": 1.0, "step": 2447, "think_completion_length": 37.21875 }, { "clip_ratio": 0.0, "completion_length": 112.84375, "epoch": 4.134907251264756, "grad_norm": 6.072609038776836, "kl": 0.595703125, "learning_rate": 1.7436762225969646e-07, "loss": 0.0006, "reward": 3.5523641109466553, "reward_std": 0.023783092387020588, "rewards/final_reward": 1.6333426320749438, "rewards/mask_iou_reward": 0.8166713160374719, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5523640513420105, "rewards/thk_ans_format_reward": 1.0, "step": 2448, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 104.765625, "epoch": 4.136593591905565, "grad_norm": 6.0999813540724555, "kl": 0.52734375, "learning_rate": 1.7403035413153455e-07, "loss": 0.0006, "reward": 3.1396783590316772, "reward_std": 0.20230736583471298, "rewards/final_reward": 1.4668547519450323, "rewards/mask_iou_reward": 0.7334273759725162, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1396782994270325, "rewards/thk_ans_format_reward": 1.0, "step": 2449, "think_completion_length": 45.625 }, { "clip_ratio": 0.0, "completion_length": 111.65625, "epoch": 4.138279932546374, "grad_norm": 37.05617431808596, "kl": 0.5400390625, "learning_rate": 1.7369308600337269e-07, "loss": 0.0005, "reward": 3.4970571994781494, "reward_std": 0.011977422516793013, "rewards/final_reward": 1.7754393479215091, "rewards/mask_iou_reward": 0.8877196739607546, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4970571994781494, "rewards/thk_ans_format_reward": 1.0, "step": 2450, "think_completion_length": 40.84375 }, { "clip_ratio": 0.0, "completion_length": 112.421875, "epoch": 4.139966273187184, "grad_norm": 19.32704554505315, "kl": 0.59375, "learning_rate": 1.7335581787521077e-07, "loss": 0.0006, "reward": 3.5168113708496094, "reward_std": 0.04148505628108978, "rewards/final_reward": 1.1838286003096194, "rewards/mask_iou_reward": 0.5919143001548097, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5168115496635437, "rewards/thk_ans_format_reward": 1.0, "step": 2451, "think_completion_length": 45.3125 }, { "clip_ratio": 0.0, "completion_length": 119.203125, "epoch": 4.141652613827993, "grad_norm": 10.432641541321457, "kl": 0.58203125, "learning_rate": 1.7301854974704891e-07, "loss": 0.0006, "reward": 3.374828338623047, "reward_std": 0.0938992714509368, "rewards/final_reward": 1.4010326022077564, "rewards/mask_iou_reward": 0.7005163011038782, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.374828279018402, "rewards/thk_ans_format_reward": 1.0, "step": 2452, "think_completion_length": 47.21875 }, { "clip_ratio": 0.0, "completion_length": 114.0, "epoch": 4.143338954468803, "grad_norm": 6.796553460953023, "kl": 0.580078125, "learning_rate": 1.72681281618887e-07, "loss": 0.0006, "reward": 3.440047025680542, "reward_std": 0.07851972058415413, "rewards/final_reward": 0.9987816165391428, "rewards/mask_iou_reward": 0.4993908082695714, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4400469660758972, "rewards/thk_ans_format_reward": 1.0, "step": 2453, "think_completion_length": 43.15625 }, { "clip_ratio": 0.0, "completion_length": 165.84375, "epoch": 4.145025295109612, "grad_norm": 138.67722008154183, "kl": 0.49609375, "learning_rate": 1.7234401349072512e-07, "loss": 0.0005, "reward": 3.4035454988479614, "reward_std": 0.1103200614452362, "rewards/final_reward": 1.0903468919454502, "rewards/mask_iou_reward": 0.5451734459727251, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4035455584526062, "rewards/thk_ans_format_reward": 1.0, "step": 2454, "think_completion_length": 39.71875 }, { "clip_ratio": 0.0, "completion_length": 114.375, "epoch": 4.146711635750422, "grad_norm": 11.689257540526738, "kl": 0.544921875, "learning_rate": 1.7200674536256323e-07, "loss": 0.0005, "reward": 3.2984225749969482, "reward_std": 0.030012394301593304, "rewards/final_reward": 0.8408413380636273, "rewards/mask_iou_reward": 0.42042066903181363, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2984225749969482, "rewards/thk_ans_format_reward": 1.0, "step": 2455, "think_completion_length": 45.375 }, { "clip_ratio": 0.0, "completion_length": 118.359375, "epoch": 4.148397976391231, "grad_norm": 11.278926743139742, "kl": 0.53515625, "learning_rate": 1.7166947723440134e-07, "loss": 0.0005, "reward": 3.174254894256592, "reward_std": 0.15206933487206697, "rewards/final_reward": 1.1872710606329515, "rewards/mask_iou_reward": 0.5936355303164758, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1742548644542694, "rewards/thk_ans_format_reward": 1.0, "step": 2456, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 112.078125, "epoch": 4.15008431703204, "grad_norm": 18.17030426571636, "kl": 0.5625, "learning_rate": 1.7133220910623943e-07, "loss": 0.0006, "reward": 3.169532537460327, "reward_std": 0.39185091853141785, "rewards/final_reward": 1.2581744958762393, "rewards/mask_iou_reward": 0.6290872479381197, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.1851573884487152, "rewards/thk_ans_format_reward": 1.0, "step": 2457, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 115.375, "epoch": 4.15177065767285, "grad_norm": 8.148184638625576, "kl": 0.572265625, "learning_rate": 1.7099494097807757e-07, "loss": 0.0006, "reward": 3.594156265258789, "reward_std": 0.1541702593676746, "rewards/final_reward": 1.6085641885897508, "rewards/mask_iou_reward": 0.8042820942948754, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.594156265258789, "rewards/thk_ans_format_reward": 1.0, "step": 2458, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 110.796875, "epoch": 4.1534569983136596, "grad_norm": 68.33425571048502, "kl": 0.58203125, "learning_rate": 1.7065767284991566e-07, "loss": 0.0006, "reward": 3.1055225133895874, "reward_std": 0.0070166842779144645, "rewards/final_reward": 0.7701352773841368, "rewards/mask_iou_reward": 0.3850676386920684, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1055223941802979, "rewards/thk_ans_format_reward": 1.0, "step": 2459, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 106.0, "epoch": 4.155143338954469, "grad_norm": 6.293257412705822, "kl": 0.54296875, "learning_rate": 1.703204047217538e-07, "loss": 0.0005, "reward": 3.3578845262527466, "reward_std": 0.28066894970834255, "rewards/final_reward": 1.5450707270999193, "rewards/mask_iou_reward": 0.7725353635499597, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3578845262527466, "rewards/thk_ans_format_reward": 1.0, "step": 2460, "think_completion_length": 43.53125 }, { "clip_ratio": 0.0, "completion_length": 133.1875, "epoch": 4.156829679595278, "grad_norm": 58.09452291181341, "kl": 0.568359375, "learning_rate": 1.6998313659359189e-07, "loss": 0.0006, "reward": 3.3822141885757446, "reward_std": 0.05487864976748824, "rewards/final_reward": 1.1080669673818953, "rewards/mask_iou_reward": 0.5540334836909476, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3822141885757446, "rewards/thk_ans_format_reward": 1.0, "step": 2461, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 110.75, "epoch": 4.158516020236088, "grad_norm": 14.957547077861275, "kl": 0.53515625, "learning_rate": 1.6964586846543e-07, "loss": 0.0005, "reward": 3.4989073276519775, "reward_std": 0.3742763102054596, "rewards/final_reward": 1.8025648484496086, "rewards/mask_iou_reward": 0.9012824242248043, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4989073872566223, "rewards/thk_ans_format_reward": 1.0, "step": 2462, "think_completion_length": 39.34375 }, { "clip_ratio": 0.0, "completion_length": 132.421875, "epoch": 4.160202360876897, "grad_norm": 10.672064968033249, "kl": 0.552734375, "learning_rate": 1.6930860033726811e-07, "loss": 0.0006, "reward": 3.3404901027679443, "reward_std": 0.11594452522695065, "rewards/final_reward": 1.3714671161693603, "rewards/mask_iou_reward": 0.6857335580846802, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3404901027679443, "rewards/thk_ans_format_reward": 1.0, "step": 2463, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 110.390625, "epoch": 4.161888701517706, "grad_norm": 18.616831487721143, "kl": 0.599609375, "learning_rate": 1.6897133220910623e-07, "loss": 0.0006, "reward": 2.7729495763778687, "reward_std": 0.059840716421604156, "rewards/final_reward": 0.555567492024673, "rewards/mask_iou_reward": 0.2777837460123365, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7729496173560619, "rewards/thk_ans_format_reward": 1.0, "step": 2464, "think_completion_length": 41.90625 }, { "clip_ratio": 0.0, "completion_length": 128.265625, "epoch": 4.163575042158516, "grad_norm": 13.961223657906311, "kl": 0.6015625, "learning_rate": 1.6863406408094437e-07, "loss": 0.0006, "reward": 3.2185802459716797, "reward_std": 0.23981109261512756, "rewards/final_reward": 1.1825092552118748, "rewards/mask_iou_reward": 0.5912546276059374, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2185802459716797, "rewards/thk_ans_format_reward": 1.0, "step": 2465, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 163.9375, "epoch": 4.165261382799326, "grad_norm": 6.115827974735234, "kl": 0.494140625, "learning_rate": 1.6829679595278246e-07, "loss": 0.0005, "reward": 3.298157572746277, "reward_std": 0.22440132359042764, "rewards/final_reward": 1.529379808654268, "rewards/mask_iou_reward": 0.764689904327134, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2981576323509216, "rewards/thk_ans_format_reward": 1.0, "step": 2466, "think_completion_length": 41.28125 }, { "clip_ratio": 0.0, "completion_length": 124.421875, "epoch": 4.166947723440135, "grad_norm": 7.534310280986116, "kl": 0.65625, "learning_rate": 1.6795952782462057e-07, "loss": 0.0007, "reward": 3.225925326347351, "reward_std": 0.0442003165371716, "rewards/final_reward": 1.2835473520049743, "rewards/mask_iou_reward": 0.6417736760024871, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2259255051612854, "rewards/thk_ans_format_reward": 1.0, "step": 2467, "think_completion_length": 40.3125 }, { "clip_ratio": 0.0, "completion_length": 114.34375, "epoch": 4.168634064080944, "grad_norm": 8.641948808655975, "kl": 0.60546875, "learning_rate": 1.6762225969645868e-07, "loss": 0.0006, "reward": 3.5798412561416626, "reward_std": 0.09165813028812408, "rewards/final_reward": 1.9735194661697966, "rewards/mask_iou_reward": 0.9867597330848983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.579841136932373, "rewards/thk_ans_format_reward": 1.0, "step": 2468, "think_completion_length": 43.625 }, { "clip_ratio": 0.0, "completion_length": 110.078125, "epoch": 4.170320404721754, "grad_norm": 10.071162143296714, "kl": 0.59765625, "learning_rate": 1.672849915682968e-07, "loss": 0.0006, "reward": 3.6934038400650024, "reward_std": 0.06623989366926253, "rewards/final_reward": 1.7010132420361757, "rewards/mask_iou_reward": 0.8505066210180878, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6934038996696472, "rewards/thk_ans_format_reward": 1.0, "step": 2469, "think_completion_length": 38.15625 }, { "clip_ratio": 0.0, "completion_length": 124.65625, "epoch": 4.172006745362563, "grad_norm": 7.700699025085925, "kl": 0.5703125, "learning_rate": 1.669477234401349e-07, "loss": 0.0006, "reward": 3.537416100502014, "reward_std": 0.07881812565028667, "rewards/final_reward": 1.569391147970117, "rewards/mask_iou_reward": 0.7846955739850585, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5374162197113037, "rewards/thk_ans_format_reward": 1.0, "step": 2470, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 113.453125, "epoch": 4.1736930860033725, "grad_norm": 7.862675036034808, "kl": 0.650390625, "learning_rate": 1.6661045531197302e-07, "loss": 0.0007, "reward": 3.441859245300293, "reward_std": 0.11183974612504244, "rewards/final_reward": 1.3813556688252486, "rewards/mask_iou_reward": 0.6906778344126243, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4418591260910034, "rewards/thk_ans_format_reward": 1.0, "step": 2471, "think_completion_length": 40.0625 }, { "clip_ratio": 0.0, "completion_length": 111.671875, "epoch": 4.175379426644182, "grad_norm": 7.741602417773985, "kl": 0.669921875, "learning_rate": 1.662731871838111e-07, "loss": 0.0007, "reward": 3.3296186923980713, "reward_std": 0.20109844952821732, "rewards/final_reward": 1.199130153563452, "rewards/mask_iou_reward": 0.599565076781726, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3296188116073608, "rewards/thk_ans_format_reward": 1.0, "step": 2472, "think_completion_length": 42.25 }, { "clip_ratio": 0.0, "completion_length": 100.3125, "epoch": 4.177065767284992, "grad_norm": 14.966449408412934, "kl": 0.51953125, "learning_rate": 1.6593591905564925e-07, "loss": 0.0005, "reward": 3.700683116912842, "reward_std": 0.0946221414487809, "rewards/final_reward": 1.5414615577057402, "rewards/mask_iou_reward": 0.7707307788528701, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7006831169128418, "rewards/thk_ans_format_reward": 1.0, "step": 2473, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 112.75, "epoch": 4.178752107925801, "grad_norm": 7.711478514273177, "kl": 0.578125, "learning_rate": 1.6559865092748734e-07, "loss": 0.0006, "reward": 3.4979605674743652, "reward_std": 0.05067999288439751, "rewards/final_reward": 1.0990489792559652, "rewards/mask_iou_reward": 0.5495244896279826, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4979605078697205, "rewards/thk_ans_format_reward": 1.0, "step": 2474, "think_completion_length": 41.875 }, { "clip_ratio": 0.0, "completion_length": 113.40625, "epoch": 4.18043844856661, "grad_norm": 13.284370242674193, "kl": 0.576171875, "learning_rate": 1.6526138279932545e-07, "loss": 0.0006, "reward": 3.54450786113739, "reward_std": 0.06997823715209961, "rewards/final_reward": 1.397211013913966, "rewards/mask_iou_reward": 0.698605506956983, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5445078611373901, "rewards/thk_ans_format_reward": 1.0, "step": 2475, "think_completion_length": 41.8125 }, { "clip_ratio": 0.0, "completion_length": 111.71875, "epoch": 4.18212478920742, "grad_norm": 7.64312945876125, "kl": 0.576171875, "learning_rate": 1.6492411467116357e-07, "loss": 0.0006, "reward": 3.5403631925582886, "reward_std": 0.08483145385980606, "rewards/final_reward": 1.8252513400297394, "rewards/mask_iou_reward": 0.9126256700148697, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5403631925582886, "rewards/thk_ans_format_reward": 1.0, "step": 2476, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 114.3125, "epoch": 4.1838111298482294, "grad_norm": 7.755037495109282, "kl": 0.7109375, "learning_rate": 1.6458684654300168e-07, "loss": 0.0006, "reward": 3.5823196172714233, "reward_std": 0.06578903924673796, "rewards/final_reward": 1.3312047196853576, "rewards/mask_iou_reward": 0.6656023598426788, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5823196768760681, "rewards/thk_ans_format_reward": 1.0, "step": 2477, "think_completion_length": 44.9375 }, { "clip_ratio": 0.0, "completion_length": 114.890625, "epoch": 4.185497470489039, "grad_norm": 11.101933961115508, "kl": 0.603515625, "learning_rate": 1.642495784148398e-07, "loss": 0.0006, "reward": 3.231121063232422, "reward_std": 0.08925764262676239, "rewards/final_reward": 1.3585384057033247, "rewards/mask_iou_reward": 0.6792692028516624, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2311209440231323, "rewards/thk_ans_format_reward": 1.0, "step": 2478, "think_completion_length": 44.09375 }, { "clip_ratio": 0.0, "completion_length": 101.765625, "epoch": 4.187183811129848, "grad_norm": 12.23671466250251, "kl": 0.576171875, "learning_rate": 1.639123102866779e-07, "loss": 0.0006, "reward": 3.426143169403076, "reward_std": 0.22152304695919156, "rewards/final_reward": 1.0948263609425108, "rewards/mask_iou_reward": 0.5474131804712554, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4261431694030762, "rewards/thk_ans_format_reward": 1.0, "step": 2479, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 149.0625, "epoch": 4.188870151770658, "grad_norm": 8.623741136224107, "kl": 0.53125, "learning_rate": 1.63575042158516e-07, "loss": 0.0005, "reward": 3.5016026496887207, "reward_std": 0.13351611513644457, "rewards/final_reward": 1.6067579257978652, "rewards/mask_iou_reward": 0.8033789628989326, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5016027092933655, "rewards/thk_ans_format_reward": 1.0, "step": 2480, "think_completion_length": 43.125 }, { "clip_ratio": 0.0, "completion_length": 111.625, "epoch": 4.190556492411467, "grad_norm": 11.972575662820923, "kl": 0.595703125, "learning_rate": 1.6323777403035414e-07, "loss": 0.0006, "reward": 3.379241704940796, "reward_std": 0.04323430173099041, "rewards/final_reward": 1.6247649723289799, "rewards/mask_iou_reward": 0.8123824861644899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.379241704940796, "rewards/thk_ans_format_reward": 1.0, "step": 2481, "think_completion_length": 39.625 }, { "clip_ratio": 0.0, "completion_length": 110.96875, "epoch": 4.192242833052276, "grad_norm": 178.5208892805482, "kl": 0.5625, "learning_rate": 1.6290050590219222e-07, "loss": 0.0006, "reward": 3.587921619415283, "reward_std": 0.15851197019219398, "rewards/final_reward": 1.4456479511328377, "rewards/mask_iou_reward": 0.7228239755664189, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5879215002059937, "rewards/thk_ans_format_reward": 1.0, "step": 2482, "think_completion_length": 38.625 }, { "clip_ratio": 0.0, "completion_length": 151.484375, "epoch": 4.193929173693086, "grad_norm": 28.989221718594397, "kl": 3.2138671875, "learning_rate": 1.6256323777403036e-07, "loss": 0.0032, "reward": 3.507957935333252, "reward_std": 0.3725784122943878, "rewards/final_reward": 1.6507667553906227, "rewards/mask_iou_reward": 0.8253833776953113, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.5235828757286072, "rewards/thk_ans_format_reward": 1.0, "step": 2483, "think_completion_length": 42.875 }, { "clip_ratio": 0.0, "completion_length": 113.6875, "epoch": 4.195615514333896, "grad_norm": 12.249738021656217, "kl": 0.54296875, "learning_rate": 1.6222596964586845e-07, "loss": 0.0005, "reward": 3.28781259059906, "reward_std": 0.047589752823114395, "rewards/final_reward": 1.53295818019778, "rewards/mask_iou_reward": 0.76647909009889, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2878124713897705, "rewards/thk_ans_format_reward": 1.0, "step": 2484, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 112.34375, "epoch": 4.197301854974705, "grad_norm": 6.0851324746702335, "kl": 0.560546875, "learning_rate": 1.6188870151770657e-07, "loss": 0.0006, "reward": 3.3584847450256348, "reward_std": 0.05267183552496135, "rewards/final_reward": 1.2067760562381666, "rewards/mask_iou_reward": 0.6033880281190833, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3584846258163452, "rewards/thk_ans_format_reward": 1.0, "step": 2485, "think_completion_length": 37.6875 }, { "clip_ratio": 0.0, "completion_length": 135.375, "epoch": 4.198988195615514, "grad_norm": 8.876805665524541, "kl": 0.5859375, "learning_rate": 1.6155143338954468e-07, "loss": 0.0006, "reward": 3.3225139379501343, "reward_std": 0.19829332828521729, "rewards/final_reward": 1.4276699296541213, "rewards/mask_iou_reward": 0.7138349648270607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3225139379501343, "rewards/thk_ans_format_reward": 1.0, "step": 2486, "think_completion_length": 43.0 }, { "clip_ratio": 0.0, "completion_length": 120.53125, "epoch": 4.200674536256324, "grad_norm": 13.089534712596679, "kl": 0.701171875, "learning_rate": 1.612141652613828e-07, "loss": 0.0007, "reward": 3.582512617111206, "reward_std": 0.08671862166374922, "rewards/final_reward": 1.4499282316972055, "rewards/mask_iou_reward": 0.7249641158486028, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5825126767158508, "rewards/thk_ans_format_reward": 1.0, "step": 2487, "think_completion_length": 40.6875 }, { "clip_ratio": 0.0, "completion_length": 109.859375, "epoch": 4.202360876897133, "grad_norm": 11.136842154555854, "kl": 0.587890625, "learning_rate": 1.6087689713322088e-07, "loss": 0.0006, "reward": 3.25620174407959, "reward_std": 0.008030643686652184, "rewards/final_reward": 1.237968267670258, "rewards/mask_iou_reward": 0.618984133835129, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.256201684474945, "rewards/thk_ans_format_reward": 1.0, "step": 2488, "think_completion_length": 41.15625 }, { "clip_ratio": 0.0, "completion_length": 110.46875, "epoch": 4.204047217537942, "grad_norm": 4.78757444111789, "kl": 0.58984375, "learning_rate": 1.6053962900505902e-07, "loss": 0.0006, "reward": 3.3365434408187866, "reward_std": 0.09441791824065149, "rewards/final_reward": 0.8951354420755497, "rewards/mask_iou_reward": 0.44756772103777487, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3365434408187866, "rewards/thk_ans_format_reward": 1.0, "step": 2489, "think_completion_length": 39.875 }, { "clip_ratio": 0.0, "completion_length": 160.5, "epoch": 4.2057335581787525, "grad_norm": 15.842252314728153, "kl": 0.603515625, "learning_rate": 1.602023608768971e-07, "loss": 0.0006, "reward": 3.6544800996780396, "reward_std": 0.0645940825343132, "rewards/final_reward": 1.71414012021267, "rewards/mask_iou_reward": 0.857070060106335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6544801592826843, "rewards/thk_ans_format_reward": 1.0, "step": 2490, "think_completion_length": 39.21875 }, { "clip_ratio": 0.0, "completion_length": 135.5625, "epoch": 4.207419898819562, "grad_norm": 5.957565108696784, "kl": 0.5341796875, "learning_rate": 1.5986509274873525e-07, "loss": 0.0005, "reward": 3.7944085597991943, "reward_std": 0.08072686195373535, "rewards/final_reward": 1.7540967487275085, "rewards/mask_iou_reward": 0.8770483743637543, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7944085597991943, "rewards/thk_ans_format_reward": 1.0, "step": 2491, "think_completion_length": 43.09375 }, { "clip_ratio": 0.0, "completion_length": 109.390625, "epoch": 4.209106239460371, "grad_norm": 17.524296887738412, "kl": 0.6015625, "learning_rate": 1.5952782462057334e-07, "loss": 0.0005, "reward": 3.2337619066238403, "reward_std": 0.11936390213668346, "rewards/final_reward": 1.1093958536501025, "rewards/mask_iou_reward": 0.5546979268250513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2337620556354523, "rewards/thk_ans_format_reward": 1.0, "step": 2492, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 114.234375, "epoch": 4.21079258010118, "grad_norm": 26.279744573862683, "kl": 0.587890625, "learning_rate": 1.5919055649241145e-07, "loss": 0.0006, "reward": 3.144417881965637, "reward_std": 0.1013356763869524, "rewards/final_reward": 0.8731246791024757, "rewards/mask_iou_reward": 0.43656233955123785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1444178223609924, "rewards/thk_ans_format_reward": 1.0, "step": 2493, "think_completion_length": 46.78125 }, { "clip_ratio": 0.0, "completion_length": 114.015625, "epoch": 4.21247892074199, "grad_norm": 6.877698788213373, "kl": 0.638671875, "learning_rate": 1.5885328836424956e-07, "loss": 0.0006, "reward": 3.605825662612915, "reward_std": 0.023511327803134918, "rewards/final_reward": 1.4181790648223518, "rewards/mask_iou_reward": 0.7090895324111759, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6058255434036255, "rewards/thk_ans_format_reward": 1.0, "step": 2494, "think_completion_length": 42.96875 }, { "clip_ratio": 0.0, "completion_length": 235.078125, "epoch": 4.214165261382799, "grad_norm": 5.386440086217419, "kl": 0.48046875, "learning_rate": 1.5851602023608768e-07, "loss": 0.0005, "reward": 3.653342127799988, "reward_std": 0.09512594901025295, "rewards/final_reward": 1.6142510430444, "rewards/mask_iou_reward": 0.8071255215222, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.653342068195343, "rewards/thk_ans_format_reward": 1.0, "step": 2495, "think_completion_length": 40.1875 }, { "clip_ratio": 0.0, "completion_length": 115.484375, "epoch": 4.2158516020236085, "grad_norm": 12.123805188988428, "kl": 0.603515625, "learning_rate": 1.5817875210792582e-07, "loss": 0.0006, "reward": 3.4331681728363037, "reward_std": 0.1368698626756668, "rewards/final_reward": 1.6940559228465024, "rewards/mask_iou_reward": 0.8470279614232512, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.433168113231659, "rewards/thk_ans_format_reward": 1.0, "step": 2496, "think_completion_length": 45.875 }, { "clip_ratio": 0.0, "completion_length": 111.90625, "epoch": 4.217537942664419, "grad_norm": 9.518964659803792, "kl": 0.576171875, "learning_rate": 1.578414839797639e-07, "loss": 0.0006, "reward": 3.535395383834839, "reward_std": 0.05820541735738516, "rewards/final_reward": 1.4662414891515558, "rewards/mask_iou_reward": 0.7331207445757779, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5353953838348389, "rewards/thk_ans_format_reward": 1.0, "step": 2497, "think_completion_length": 39.59375 }, { "clip_ratio": 0.0, "completion_length": 118.234375, "epoch": 4.219224283305228, "grad_norm": 7.8898539600019015, "kl": 0.78125, "learning_rate": 1.5750421585160202e-07, "loss": 0.0008, "reward": 3.6014208793640137, "reward_std": 0.017208684235811234, "rewards/final_reward": 1.6433232711184682, "rewards/mask_iou_reward": 0.8216616355592341, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6014208793640137, "rewards/thk_ans_format_reward": 1.0, "step": 2498, "think_completion_length": 44.8125 }, { "clip_ratio": 0.0, "completion_length": 128.25, "epoch": 4.220910623946037, "grad_norm": 5.996096425999949, "kl": 0.5859375, "learning_rate": 1.5716694772344013e-07, "loss": 0.0006, "reward": 3.2900307178497314, "reward_std": 0.4222857290878892, "rewards/final_reward": 0.8600683159762403, "rewards/mask_iou_reward": 0.43003415798812017, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2900307774543762, "rewards/thk_ans_format_reward": 1.0, "step": 2499, "think_completion_length": 50.59375 }, { "clip_ratio": 0.0, "completion_length": 113.03125, "epoch": 4.222596964586846, "grad_norm": 17.096377692982486, "kl": 0.59375, "learning_rate": 1.5682967959527825e-07, "loss": 0.0006, "reward": 3.5150952339172363, "reward_std": 0.08423554711043835, "rewards/final_reward": 1.6300621768573955, "rewards/mask_iou_reward": 0.8150310884286978, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5150951743125916, "rewards/thk_ans_format_reward": 1.0, "step": 2500, "think_completion_length": 42.65625 }, { "clip_ratio": 0.0, "completion_length": 113.390625, "epoch": 4.224283305227656, "grad_norm": 12.643943836787773, "kl": 0.583984375, "learning_rate": 1.5649241146711636e-07, "loss": 0.0006, "reward": 3.491933822631836, "reward_std": 0.1002687681466341, "rewards/final_reward": 1.338027084975392, "rewards/mask_iou_reward": 0.669013542487696, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.491933822631836, "rewards/thk_ans_format_reward": 1.0, "step": 2501, "think_completion_length": 40.28125 }, { "clip_ratio": 0.0, "completion_length": 112.0, "epoch": 4.2259696458684655, "grad_norm": 5.584911428828709, "kl": 0.5546875, "learning_rate": 1.5615514333895447e-07, "loss": 0.0005, "reward": 3.7746880054473877, "reward_std": 0.10910388245247304, "rewards/final_reward": 1.7356168712740996, "rewards/mask_iou_reward": 0.8678084356370498, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7746880054473877, "rewards/thk_ans_format_reward": 1.0, "step": 2502, "think_completion_length": 39.15625 }, { "clip_ratio": 0.0, "completion_length": 129.5, "epoch": 4.227655986509275, "grad_norm": 8.64637878813344, "kl": 0.640625, "learning_rate": 1.5581787521079256e-07, "loss": 0.0006, "reward": 3.6966729164123535, "reward_std": 0.04432039085077122, "rewards/final_reward": 1.5541293493811987, "rewards/mask_iou_reward": 0.7770646746905994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6966729164123535, "rewards/thk_ans_format_reward": 1.0, "step": 2503, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 114.390625, "epoch": 4.229342327150085, "grad_norm": 11.840944781081609, "kl": 0.61328125, "learning_rate": 1.554806070826307e-07, "loss": 0.0006, "reward": 3.6088430881500244, "reward_std": 0.04924646159633994, "rewards/final_reward": 1.4039237413250574, "rewards/mask_iou_reward": 0.7019618706625287, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.608843207359314, "rewards/thk_ans_format_reward": 1.0, "step": 2504, "think_completion_length": 47.46875 }, { "clip_ratio": 0.0, "completion_length": 114.875, "epoch": 4.231028667790894, "grad_norm": 6.119799545502641, "kl": 0.69921875, "learning_rate": 1.551433389544688e-07, "loss": 0.0007, "reward": 3.3514033555984497, "reward_std": 0.1196487583220005, "rewards/final_reward": 1.2155702096087642, "rewards/mask_iou_reward": 0.6077851048043821, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3514032363891602, "rewards/thk_ans_format_reward": 1.0, "step": 2505, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 111.59375, "epoch": 4.232715008431703, "grad_norm": 76.93348122363103, "kl": 0.568359375, "learning_rate": 1.548060708263069e-07, "loss": 0.0006, "reward": 3.7489959001541138, "reward_std": 0.15297742490656674, "rewards/final_reward": 1.7989957097859706, "rewards/mask_iou_reward": 0.8994978548929853, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7489957809448242, "rewards/thk_ans_format_reward": 1.0, "step": 2506, "think_completion_length": 41.6875 }, { "clip_ratio": 0.0, "completion_length": 175.09375, "epoch": 4.234401349072512, "grad_norm": 10.290057729479736, "kl": 0.548828125, "learning_rate": 1.5446880269814502e-07, "loss": 0.0006, "reward": 3.3262614011764526, "reward_std": 0.11492184177041054, "rewards/final_reward": 1.1012222042789321, "rewards/mask_iou_reward": 0.5506111021394661, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3262612223625183, "rewards/thk_ans_format_reward": 1.0, "step": 2507, "think_completion_length": 45.1875 }, { "clip_ratio": 0.0, "completion_length": 153.359375, "epoch": 4.236087689713322, "grad_norm": 5.491342812996931, "kl": 0.482421875, "learning_rate": 1.5413153456998313e-07, "loss": 0.0005, "reward": 3.218244433403015, "reward_std": 0.14265291579067707, "rewards/final_reward": 0.9250615596363921, "rewards/mask_iou_reward": 0.46253077981819607, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2182443737983704, "rewards/thk_ans_format_reward": 1.0, "step": 2508, "think_completion_length": 41.84375 }, { "clip_ratio": 0.0, "completion_length": 121.578125, "epoch": 4.237774030354132, "grad_norm": 8.256158244019627, "kl": 0.560546875, "learning_rate": 1.5379426644182125e-07, "loss": 0.0006, "reward": 3.20368230342865, "reward_std": 0.033036405220627785, "rewards/final_reward": 0.9446484949510479, "rewards/mask_iou_reward": 0.47232424747552393, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2036822140216827, "rewards/thk_ans_format_reward": 1.0, "step": 2509, "think_completion_length": 43.46875 }, { "clip_ratio": 0.0, "completion_length": 105.578125, "epoch": 4.239460370994941, "grad_norm": 10.240281506736242, "kl": 0.580078125, "learning_rate": 1.5345699831365936e-07, "loss": 0.0006, "reward": 3.5657538175582886, "reward_std": 0.4374186247587204, "rewards/final_reward": 1.715664315244335, "rewards/mask_iou_reward": 0.8578321576221675, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5657540559768677, "rewards/thk_ans_format_reward": 1.0, "step": 2510, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 123.765625, "epoch": 4.24114671163575, "grad_norm": 18.110017710636633, "kl": 0.607421875, "learning_rate": 1.5311973018549745e-07, "loss": 0.0006, "reward": 3.4502862691879272, "reward_std": 0.2845083028078079, "rewards/final_reward": 1.2480094396741797, "rewards/mask_iou_reward": 0.6240047198370898, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4502861499786377, "rewards/thk_ans_format_reward": 1.0, "step": 2511, "think_completion_length": 46.4375 }, { "clip_ratio": 0.0, "completion_length": 146.578125, "epoch": 4.24283305227656, "grad_norm": 13.761643496398287, "kl": 0.53515625, "learning_rate": 1.527824620573356e-07, "loss": 0.0006, "reward": 3.0231703519821167, "reward_std": 0.06616012193262577, "rewards/final_reward": 1.2360284179623469, "rewards/mask_iou_reward": 0.6180142089811734, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0231703221797943, "rewards/thk_ans_format_reward": 1.0, "step": 2512, "think_completion_length": 40.65625 }, { "clip_ratio": 0.0, "completion_length": 115.875, "epoch": 4.244519392917369, "grad_norm": 16.104599863716597, "kl": 0.52734375, "learning_rate": 1.5244519392917367e-07, "loss": 0.0005, "reward": 3.3228198289871216, "reward_std": 0.13831503875553608, "rewards/final_reward": 0.949170678148781, "rewards/mask_iou_reward": 0.4745853390743905, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3228199481964111, "rewards/thk_ans_format_reward": 1.0, "step": 2513, "think_completion_length": 47.40625 }, { "clip_ratio": 0.0, "completion_length": 242.03125, "epoch": 4.246205733558178, "grad_norm": 6.788400625001057, "kl": 0.5, "learning_rate": 1.5210792580101181e-07, "loss": 0.0005, "reward": 3.593444347381592, "reward_std": 0.060188669711351395, "rewards/final_reward": 1.4872439563748985, "rewards/mask_iou_reward": 0.7436219781874492, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5934444069862366, "rewards/thk_ans_format_reward": 1.0, "step": 2514, "think_completion_length": 44.25 }, { "clip_ratio": 0.0, "completion_length": 126.671875, "epoch": 4.2478920741989885, "grad_norm": 9.103261919282126, "kl": 0.5859375, "learning_rate": 1.517706576728499e-07, "loss": 0.0006, "reward": 3.8253079652786255, "reward_std": 0.04512532241642475, "rewards/final_reward": 1.76673815732204, "rewards/mask_iou_reward": 0.88336907866102, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8253080248832703, "rewards/thk_ans_format_reward": 1.0, "step": 2515, "think_completion_length": 41.3125 }, { "clip_ratio": 0.0, "completion_length": 189.765625, "epoch": 4.249578414839798, "grad_norm": 9.591163151149308, "kl": 0.4736328125, "learning_rate": 1.5143338954468802e-07, "loss": 0.0005, "reward": 3.2388182878494263, "reward_std": 0.13420674204826355, "rewards/final_reward": 1.490955949030143, "rewards/mask_iou_reward": 0.7454779745150715, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.238818347454071, "rewards/thk_ans_format_reward": 1.0, "step": 2516, "think_completion_length": 41.46875 }, { "clip_ratio": 0.0, "completion_length": 103.515625, "epoch": 4.251264755480607, "grad_norm": 6.9616532584685835, "kl": 0.576171875, "learning_rate": 1.5109612141652613e-07, "loss": 0.0006, "reward": 3.499723196029663, "reward_std": 0.24417180567979813, "rewards/final_reward": 1.3162671110819626, "rewards/mask_iou_reward": 0.6581335555409813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4997231364250183, "rewards/thk_ans_format_reward": 1.0, "step": 2517, "think_completion_length": 40.84375 }, { "clip_ratio": 0.0, "completion_length": 111.5625, "epoch": 4.252951096121416, "grad_norm": 7.3075343624463684, "kl": 0.603515625, "learning_rate": 1.5075885328836424e-07, "loss": 0.0006, "reward": 3.616728186607361, "reward_std": 0.013671865686774254, "rewards/final_reward": 1.4692096036141498, "rewards/mask_iou_reward": 0.7346048018070749, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6167281866073608, "rewards/thk_ans_format_reward": 1.0, "step": 2518, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 152.15625, "epoch": 4.254637436762226, "grad_norm": 9.351763371801017, "kl": 0.546875, "learning_rate": 1.5042158516020233e-07, "loss": 0.0005, "reward": 3.1668334007263184, "reward_std": 0.02611308917403221, "rewards/final_reward": 1.299961387052443, "rewards/mask_iou_reward": 0.6499806935262215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1668334603309631, "rewards/thk_ans_format_reward": 1.0, "step": 2519, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 159.09375, "epoch": 4.256323777403035, "grad_norm": 22.596767977071785, "kl": 0.541015625, "learning_rate": 1.5008431703204047e-07, "loss": 0.0005, "reward": 3.0126872062683105, "reward_std": 0.11716877296566963, "rewards/final_reward": 1.1052302769538236, "rewards/mask_iou_reward": 0.5526151384769118, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0126870572566986, "rewards/thk_ans_format_reward": 1.0, "step": 2520, "think_completion_length": 41.875 }, { "clip_ratio": 0.0, "completion_length": 101.84375, "epoch": 4.2580101180438445, "grad_norm": 38.95245679064524, "kl": 0.57421875, "learning_rate": 1.4974704890387856e-07, "loss": 0.0006, "reward": 3.564436197280884, "reward_std": 0.24005300551652908, "rewards/final_reward": 1.551153769610408, "rewards/mask_iou_reward": 0.775576884805204, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5644361972808838, "rewards/thk_ans_format_reward": 1.0, "step": 2521, "think_completion_length": 42.53125 }, { "clip_ratio": 0.0, "completion_length": 117.953125, "epoch": 4.259696458684655, "grad_norm": 12.630445249364401, "kl": 0.53515625, "learning_rate": 1.494097807757167e-07, "loss": 0.0005, "reward": 3.7122442722320557, "reward_std": 0.03813726641237736, "rewards/final_reward": 1.8947568158228092, "rewards/mask_iou_reward": 0.9473784079114046, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7122442722320557, "rewards/thk_ans_format_reward": 1.0, "step": 2522, "think_completion_length": 48.3125 }, { "clip_ratio": 0.0, "completion_length": 114.375, "epoch": 4.261382799325464, "grad_norm": 7.167206354353222, "kl": 0.61328125, "learning_rate": 1.4907251264755479e-07, "loss": 0.0006, "reward": 3.8314647674560547, "reward_std": 0.010048975702375174, "rewards/final_reward": 1.8508296433772948, "rewards/mask_iou_reward": 0.9254148216886474, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.83146470785141, "rewards/thk_ans_format_reward": 1.0, "step": 2523, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 116.359375, "epoch": 4.263069139966273, "grad_norm": 9.268448682232227, "kl": 0.61328125, "learning_rate": 1.487352445193929e-07, "loss": 0.0006, "reward": 3.4042723178863525, "reward_std": 0.22066151723265648, "rewards/final_reward": 1.3187135732367026, "rewards/mask_iou_reward": 0.6593567866183513, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4042723178863525, "rewards/thk_ans_format_reward": 1.0, "step": 2524, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 109.578125, "epoch": 4.264755480607082, "grad_norm": 9.88039804584237, "kl": 0.556640625, "learning_rate": 1.4839797639123104e-07, "loss": 0.0006, "reward": 3.5861204862594604, "reward_std": 0.09150342643260956, "rewards/final_reward": 1.6389049707249272, "rewards/mask_iou_reward": 0.8194524853624636, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.586120367050171, "rewards/thk_ans_format_reward": 1.0, "step": 2525, "think_completion_length": 38.40625 }, { "clip_ratio": 0.0, "completion_length": 117.921875, "epoch": 4.266441821247892, "grad_norm": 15.943676525441946, "kl": 0.59765625, "learning_rate": 1.4806070826306913e-07, "loss": 0.0006, "reward": 3.186599612236023, "reward_std": 0.08903985074721277, "rewards/final_reward": 1.1325104068487184, "rewards/mask_iou_reward": 0.5662552034243592, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1865995526313782, "rewards/thk_ans_format_reward": 1.0, "step": 2526, "think_completion_length": 45.5 }, { "clip_ratio": 0.0, "completion_length": 178.25, "epoch": 4.2681281618887015, "grad_norm": 12.867814490080903, "kl": 0.5390625, "learning_rate": 1.4772344013490727e-07, "loss": 0.0005, "reward": 2.867884874343872, "reward_std": 0.09711506590247154, "rewards/final_reward": 0.8848869288997784, "rewards/mask_iou_reward": 0.4424434644498892, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8678848743438721, "rewards/thk_ans_format_reward": 1.0, "step": 2527, "think_completion_length": 41.53125 }, { "clip_ratio": 0.0, "completion_length": 115.9375, "epoch": 4.269814502529511, "grad_norm": 18.68930223402884, "kl": 0.59765625, "learning_rate": 1.4738617200674536e-07, "loss": 0.0006, "reward": 3.5304031372070312, "reward_std": 0.1704651303589344, "rewards/final_reward": 1.887731740474944, "rewards/mask_iou_reward": 0.943865870237472, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5304031372070312, "rewards/thk_ans_format_reward": 1.0, "step": 2528, "think_completion_length": 43.90625 }, { "clip_ratio": 0.0, "completion_length": 120.59375, "epoch": 4.271500843170321, "grad_norm": 7.147821963050068, "kl": 0.73046875, "learning_rate": 1.4704890387858347e-07, "loss": 0.0007, "reward": 3.2026796340942383, "reward_std": 0.2595672160387039, "rewards/final_reward": 1.6295541597620495, "rewards/mask_iou_reward": 0.8147770798810248, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.202679693698883, "rewards/thk_ans_format_reward": 1.0, "step": 2529, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 111.109375, "epoch": 4.27318718381113, "grad_norm": 8.772915963221111, "kl": 0.65625, "learning_rate": 1.4671163575042158e-07, "loss": 0.0007, "reward": 3.244017481803894, "reward_std": 0.1081763282418251, "rewards/final_reward": 1.2652142418705377, "rewards/mask_iou_reward": 0.6326071209352688, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2440175414085388, "rewards/thk_ans_format_reward": 1.0, "step": 2530, "think_completion_length": 43.0625 }, { "clip_ratio": 0.0, "completion_length": 111.921875, "epoch": 4.274873524451939, "grad_norm": 14.419830166789696, "kl": 0.599609375, "learning_rate": 1.463743676222597e-07, "loss": 0.0006, "reward": 2.8510278463363647, "reward_std": 0.06465473957359791, "rewards/final_reward": 0.6454403339958547, "rewards/mask_iou_reward": 0.3227201669979273, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8510278761386871, "rewards/thk_ans_format_reward": 1.0, "step": 2531, "think_completion_length": 41.59375 }, { "clip_ratio": 0.0, "completion_length": 111.8125, "epoch": 4.276559865092748, "grad_norm": 30.39215143435121, "kl": 0.60546875, "learning_rate": 1.4603709949409778e-07, "loss": 0.0006, "reward": 3.8287590742111206, "reward_std": 0.10854035732336342, "rewards/final_reward": 1.8429312055969538, "rewards/mask_iou_reward": 0.9214656027984769, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8287590742111206, "rewards/thk_ans_format_reward": 1.0, "step": 2532, "think_completion_length": 40.03125 }, { "clip_ratio": 0.0, "completion_length": 138.125, "epoch": 4.278246205733558, "grad_norm": 6.361875856345234, "kl": 0.5078125, "learning_rate": 1.4569983136593593e-07, "loss": 0.0005, "reward": 3.4332364797592163, "reward_std": 0.13735715672373772, "rewards/final_reward": 0.897129527052885, "rewards/mask_iou_reward": 0.4485647635264425, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.433236539363861, "rewards/thk_ans_format_reward": 1.0, "step": 2533, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 117.78125, "epoch": 4.279932546374368, "grad_norm": 7.905426948466219, "kl": 0.533203125, "learning_rate": 1.45362563237774e-07, "loss": 0.0005, "reward": 3.7020827531814575, "reward_std": 0.2393525391817093, "rewards/final_reward": 1.5801495385519464, "rewards/mask_iou_reward": 0.7900747692759732, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7020828127861023, "rewards/thk_ans_format_reward": 1.0, "step": 2534, "think_completion_length": 41.78125 }, { "clip_ratio": 0.0, "completion_length": 112.359375, "epoch": 4.281618887015177, "grad_norm": 21.16779988755827, "kl": 0.58984375, "learning_rate": 1.4502529510961215e-07, "loss": 0.0006, "reward": 3.296600341796875, "reward_std": 0.11001870781183243, "rewards/final_reward": 0.994581774113384, "rewards/mask_iou_reward": 0.497290887056692, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.296600341796875, "rewards/thk_ans_format_reward": 1.0, "step": 2535, "think_completion_length": 42.5625 }, { "clip_ratio": 0.0, "completion_length": 137.15625, "epoch": 4.283305227655987, "grad_norm": 34.200597672922946, "kl": 0.525390625, "learning_rate": 1.4468802698145024e-07, "loss": 0.0006, "reward": 3.4385606050491333, "reward_std": 0.1196369118988514, "rewards/final_reward": 1.5373945337333406, "rewards/mask_iou_reward": 0.7686972668666703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4385607242584229, "rewards/thk_ans_format_reward": 1.0, "step": 2536, "think_completion_length": 39.15625 }, { "clip_ratio": 0.0, "completion_length": 114.8125, "epoch": 4.284991568296796, "grad_norm": 8.527478808694564, "kl": 0.5546875, "learning_rate": 1.4435075885328835e-07, "loss": 0.0006, "reward": 3.402387022972107, "reward_std": 0.13465053914114833, "rewards/final_reward": 1.165708602414731, "rewards/mask_iou_reward": 0.5828543012073655, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4023869633674622, "rewards/thk_ans_format_reward": 1.0, "step": 2537, "think_completion_length": 38.3125 }, { "clip_ratio": 0.0, "completion_length": 123.140625, "epoch": 4.286677908937605, "grad_norm": 7.455663700142302, "kl": 0.5859375, "learning_rate": 1.4401349072512647e-07, "loss": 0.0006, "reward": 3.49160373210907, "reward_std": 0.08658642042428255, "rewards/final_reward": 1.4236609122565655, "rewards/mask_iou_reward": 0.7118304561282828, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4916037321090698, "rewards/thk_ans_format_reward": 1.0, "step": 2538, "think_completion_length": 42.15625 }, { "clip_ratio": 0.0, "completion_length": 122.703125, "epoch": 4.2883642495784144, "grad_norm": 7.536923008415111, "kl": 0.59765625, "learning_rate": 1.4367622259696458e-07, "loss": 0.0006, "reward": 3.7058597803115845, "reward_std": 0.0078626349568367, "rewards/final_reward": 1.8276194686022689, "rewards/mask_iou_reward": 0.9138097343011344, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7058596014976501, "rewards/thk_ans_format_reward": 1.0, "step": 2539, "think_completion_length": 41.03125 }, { "clip_ratio": 0.0, "completion_length": 134.484375, "epoch": 4.2900505902192245, "grad_norm": 7.481946672048057, "kl": 0.568359375, "learning_rate": 1.433389544688027e-07, "loss": 0.0006, "reward": 3.454736828804016, "reward_std": 0.10277345031499863, "rewards/final_reward": 1.76114098749309, "rewards/mask_iou_reward": 0.880570493746545, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4547368288040161, "rewards/thk_ans_format_reward": 1.0, "step": 2540, "think_completion_length": 46.46875 }, { "clip_ratio": 0.0, "completion_length": 169.3125, "epoch": 4.291736930860034, "grad_norm": 23.34801402220961, "kl": 0.470703125, "learning_rate": 1.430016863406408e-07, "loss": 0.0005, "reward": 3.748860239982605, "reward_std": 0.05715477233752608, "rewards/final_reward": 1.6972026995457257, "rewards/mask_iou_reward": 0.8486013497728628, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.748860478401184, "rewards/thk_ans_format_reward": 1.0, "step": 2541, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 112.703125, "epoch": 4.293423271500843, "grad_norm": 7.4552507036217355, "kl": 0.578125, "learning_rate": 1.426644182124789e-07, "loss": 0.0005, "reward": 3.2460508346557617, "reward_std": 0.09356878604739904, "rewards/final_reward": 1.2559207078245218, "rewards/mask_iou_reward": 0.6279603539122609, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.246050864458084, "rewards/thk_ans_format_reward": 1.0, "step": 2542, "think_completion_length": 37.6875 }, { "clip_ratio": 0.0, "completion_length": 115.125, "epoch": 4.295109612141653, "grad_norm": 10.08757853605006, "kl": 0.5546875, "learning_rate": 1.4232715008431704e-07, "loss": 0.0004, "reward": 3.878480553627014, "reward_std": 0.010050483266240917, "rewards/final_reward": 1.878368428642306, "rewards/mask_iou_reward": 0.939184214321153, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8784804940223694, "rewards/thk_ans_format_reward": 1.0, "step": 2543, "think_completion_length": 46.71875 }, { "clip_ratio": 0.0, "completion_length": 113.890625, "epoch": 4.296795952782462, "grad_norm": 11.763135638839753, "kl": 0.611328125, "learning_rate": 1.4198988195615512e-07, "loss": 0.0006, "reward": 3.5589308738708496, "reward_std": 0.05203424580395222, "rewards/final_reward": 1.8302980244920706, "rewards/mask_iou_reward": 0.9151490122460353, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.55893075466156, "rewards/thk_ans_format_reward": 1.0, "step": 2544, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 143.40625, "epoch": 4.298482293423271, "grad_norm": 24.036707132251024, "kl": 0.509765625, "learning_rate": 1.4165261382799326e-07, "loss": 0.0005, "reward": 3.25858473777771, "reward_std": 0.04267970938235521, "rewards/final_reward": 1.1185800974288043, "rewards/mask_iou_reward": 0.5592900487144021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.258584588766098, "rewards/thk_ans_format_reward": 1.0, "step": 2545, "think_completion_length": 42.71875 }, { "clip_ratio": 0.0, "completion_length": 159.640625, "epoch": 4.300168634064081, "grad_norm": 7.313306979254338, "kl": 0.548828125, "learning_rate": 1.4131534569983135e-07, "loss": 0.0006, "reward": 3.6531234979629517, "reward_std": 0.046843864023685455, "rewards/final_reward": 1.6615908392074423, "rewards/mask_iou_reward": 0.8307954196037212, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.653123378753662, "rewards/thk_ans_format_reward": 1.0, "step": 2546, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 120.984375, "epoch": 4.301854974704891, "grad_norm": 12.271146225861086, "kl": 0.556640625, "learning_rate": 1.4097807757166947e-07, "loss": 0.0006, "reward": 3.4084343910217285, "reward_std": 0.17545827478170395, "rewards/final_reward": 1.5821885705577525, "rewards/mask_iou_reward": 0.7910942852788763, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4084343910217285, "rewards/thk_ans_format_reward": 1.0, "step": 2547, "think_completion_length": 40.3125 }, { "clip_ratio": 0.0, "completion_length": 115.078125, "epoch": 4.3035413153457, "grad_norm": 6.176229533488846, "kl": 0.623046875, "learning_rate": 1.4064080944350758e-07, "loss": 0.0006, "reward": 3.5700900554656982, "reward_std": 0.14789751917123795, "rewards/final_reward": 1.482314263419597, "rewards/mask_iou_reward": 0.7411571317097985, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.570090115070343, "rewards/thk_ans_format_reward": 1.0, "step": 2548, "think_completion_length": 42.3125 }, { "clip_ratio": 0.0, "completion_length": 111.875, "epoch": 4.305227655986509, "grad_norm": 13.243385182075865, "kl": 0.58984375, "learning_rate": 1.403035413153457e-07, "loss": 0.0006, "reward": 3.7301249504089355, "reward_std": 0.1095227412879467, "rewards/final_reward": 1.5400399812358647, "rewards/mask_iou_reward": 0.7700199906179324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7301249504089355, "rewards/thk_ans_format_reward": 1.0, "step": 2549, "think_completion_length": 37.21875 }, { "clip_ratio": 0.0, "completion_length": 113.0625, "epoch": 4.306913996627319, "grad_norm": 14.557143592798408, "kl": 0.611328125, "learning_rate": 1.3996627318718378e-07, "loss": 0.0006, "reward": 3.8407446146011353, "reward_std": 0.014246857725083828, "rewards/final_reward": 1.929611008155712, "rewards/mask_iou_reward": 0.964805504077856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8407444953918457, "rewards/thk_ans_format_reward": 1.0, "step": 2550, "think_completion_length": 39.8125 }, { "clip_ratio": 0.0, "completion_length": 110.21875, "epoch": 4.308600337268128, "grad_norm": 7.4757362832284615, "kl": 0.5546875, "learning_rate": 1.3962900505902192e-07, "loss": 0.0006, "reward": 3.779623031616211, "reward_std": 0.0036925169406458735, "rewards/final_reward": 1.8417629323380633, "rewards/mask_iou_reward": 0.9208814661690317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7796229720115662, "rewards/thk_ans_format_reward": 1.0, "step": 2551, "think_completion_length": 41.1875 }, { "clip_ratio": 0.0, "completion_length": 111.953125, "epoch": 4.3102866779089375, "grad_norm": 17.75758948815859, "kl": 0.609375, "learning_rate": 1.3929173693086e-07, "loss": 0.0006, "reward": 3.1587305068969727, "reward_std": 0.024611515924334526, "rewards/final_reward": 1.088152989525369, "rewards/mask_iou_reward": 0.5440764947626845, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1587304770946503, "rewards/thk_ans_format_reward": 1.0, "step": 2552, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 172.5625, "epoch": 4.311973018549747, "grad_norm": 9.284946231608782, "kl": 0.4990234375, "learning_rate": 1.3895446880269815e-07, "loss": 0.0005, "reward": 3.5739543437957764, "reward_std": 0.07344475947320461, "rewards/final_reward": 1.3897168341802106, "rewards/mask_iou_reward": 0.6948584170901053, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5739544034004211, "rewards/thk_ans_format_reward": 1.0, "step": 2553, "think_completion_length": 43.71875 }, { "clip_ratio": 0.0, "completion_length": 114.6875, "epoch": 4.313659359190557, "grad_norm": 18.14816708538634, "kl": 0.59375, "learning_rate": 1.3861720067453624e-07, "loss": 0.0007, "reward": 3.52009117603302, "reward_std": 0.15692077949643135, "rewards/final_reward": 1.8011629865816787, "rewards/mask_iou_reward": 0.9005814932908394, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5200912952423096, "rewards/thk_ans_format_reward": 1.0, "step": 2554, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 108.5, "epoch": 4.315345699831366, "grad_norm": 26.487807469250537, "kl": 0.5859375, "learning_rate": 1.3827993254637435e-07, "loss": 0.0006, "reward": 2.9428672790527344, "reward_std": 0.07025075890123844, "rewards/final_reward": 1.0398247604871105, "rewards/mask_iou_reward": 0.5199123802435552, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9428671598434448, "rewards/thk_ans_format_reward": 1.0, "step": 2555, "think_completion_length": 36.40625 }, { "clip_ratio": 0.0, "completion_length": 220.21875, "epoch": 4.317032040472175, "grad_norm": 5.5807297955268576, "kl": 0.458984375, "learning_rate": 1.379426644182125e-07, "loss": 0.0005, "reward": 3.097283959388733, "reward_std": 0.4455588236451149, "rewards/final_reward": 0.8390536942603471, "rewards/mask_iou_reward": 0.4195268471301736, "rewards/sam_format_reward": 0.953125, "rewards/sam_reward_func_ultra": 1.191033959388733, "rewards/thk_ans_format_reward": 0.953125, "step": 2556, "think_completion_length": 47.15625 }, { "clip_ratio": 0.0, "completion_length": 113.890625, "epoch": 4.318718381112985, "grad_norm": 8.374934525891868, "kl": 0.693359375, "learning_rate": 1.3760539629005058e-07, "loss": 0.0007, "reward": 3.074215888977051, "reward_std": 0.0957517126807943, "rewards/final_reward": 1.4687460351048571, "rewards/mask_iou_reward": 0.7343730175524286, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0742157995700836, "rewards/thk_ans_format_reward": 1.0, "step": 2557, "think_completion_length": 46.5 }, { "clip_ratio": 0.0, "completion_length": 161.28125, "epoch": 4.320404721753794, "grad_norm": 6.11302937139873, "kl": 0.5458984375, "learning_rate": 1.3726812816188872e-07, "loss": 0.0005, "reward": 3.7231526374816895, "reward_std": 0.07856985554099083, "rewards/final_reward": 1.6635399623394522, "rewards/mask_iou_reward": 0.8317699811697261, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7231525778770447, "rewards/thk_ans_format_reward": 1.0, "step": 2558, "think_completion_length": 45.9375 }, { "clip_ratio": 0.0, "completion_length": 112.1875, "epoch": 4.322091062394604, "grad_norm": 25.653913441069957, "kl": 0.80078125, "learning_rate": 1.369308600337268e-07, "loss": 0.0008, "reward": 3.6056541204452515, "reward_std": 0.0955022219568491, "rewards/final_reward": 1.5772293602598686, "rewards/mask_iou_reward": 0.7886146801299343, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6056540608406067, "rewards/thk_ans_format_reward": 1.0, "step": 2559, "think_completion_length": 41.21875 }, { "clip_ratio": 0.0, "completion_length": 119.390625, "epoch": 4.323777403035413, "grad_norm": 13.998568715322113, "kl": 0.578125, "learning_rate": 1.3659359190556492e-07, "loss": 0.0006, "reward": 3.3204551935195923, "reward_std": 0.06416707020252943, "rewards/final_reward": 1.3612900585715115, "rewards/mask_iou_reward": 0.6806450292857558, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3204552829265594, "rewards/thk_ans_format_reward": 1.0, "step": 2560, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 129.171875, "epoch": 4.325463743676223, "grad_norm": 9.274193578613756, "kl": 0.541015625, "learning_rate": 1.3625632377740303e-07, "loss": 0.0005, "reward": 3.2127416133880615, "reward_std": 0.17955580353736877, "rewards/final_reward": 1.5627788016579989, "rewards/mask_iou_reward": 0.7813894008289994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2283666729927063, "rewards/thk_ans_format_reward": 0.984375, "step": 2561, "think_completion_length": 43.59375 }, { "clip_ratio": 0.0, "completion_length": 115.796875, "epoch": 4.327150084317032, "grad_norm": 6.325127294286153, "kl": 0.5390625, "learning_rate": 1.3591905564924115e-07, "loss": 0.0005, "reward": 2.9596033096313477, "reward_std": 0.14582211151719093, "rewards/final_reward": 0.9671578490251985, "rewards/mask_iou_reward": 0.48357892451259926, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9596033245325089, "rewards/thk_ans_format_reward": 1.0, "step": 2562, "think_completion_length": 48.34375 }, { "clip_ratio": 0.0, "completion_length": 109.9375, "epoch": 4.328836424957841, "grad_norm": 7.20547111501158, "kl": 0.619140625, "learning_rate": 1.3558178752107923e-07, "loss": 0.0007, "reward": 3.8558907508850098, "reward_std": 0.01761279860511422, "rewards/final_reward": 1.7874402635111308, "rewards/mask_iou_reward": 0.8937201317555654, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8558905720710754, "rewards/thk_ans_format_reward": 1.0, "step": 2563, "think_completion_length": 41.75 }, { "clip_ratio": 0.0, "completion_length": 114.4375, "epoch": 4.330522765598651, "grad_norm": 12.15445085266186, "kl": 0.564453125, "learning_rate": 1.3524451939291738e-07, "loss": 0.0006, "reward": 3.4675991535186768, "reward_std": 0.12800533324480057, "rewards/final_reward": 1.8438290283183836, "rewards/mask_iou_reward": 0.9219145141591918, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4675992727279663, "rewards/thk_ans_format_reward": 1.0, "step": 2564, "think_completion_length": 44.25 }, { "clip_ratio": 0.0, "completion_length": 111.625, "epoch": 4.3322091062394605, "grad_norm": 22.58463662805342, "kl": 0.734375, "learning_rate": 1.3490725126475546e-07, "loss": 0.0006, "reward": 3.4586970806121826, "reward_std": 0.05200528213754296, "rewards/final_reward": 0.9920415770419135, "rewards/mask_iou_reward": 0.49602078852095677, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.458696961402893, "rewards/thk_ans_format_reward": 1.0, "step": 2565, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 140.296875, "epoch": 4.33389544688027, "grad_norm": 8.36367289134029, "kl": 0.5703125, "learning_rate": 1.345699831365936e-07, "loss": 0.0006, "reward": 3.0413581132888794, "reward_std": 0.2051805593073368, "rewards/final_reward": 1.6693194437875627, "rewards/mask_iou_reward": 0.8346597218937813, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0413581728935242, "rewards/thk_ans_format_reward": 1.0, "step": 2566, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 110.609375, "epoch": 4.335581787521079, "grad_norm": 20.45512318736674, "kl": 0.6328125, "learning_rate": 1.342327150084317e-07, "loss": 0.0006, "reward": 3.7222063541412354, "reward_std": 0.133345490321517, "rewards/final_reward": 1.667593703850473, "rewards/mask_iou_reward": 0.8337968519252364, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.722206175327301, "rewards/thk_ans_format_reward": 1.0, "step": 2567, "think_completion_length": 40.09375 }, { "clip_ratio": 0.0, "completion_length": 111.09375, "epoch": 4.337268128161889, "grad_norm": 7.646885091994143, "kl": 0.548828125, "learning_rate": 1.338954468802698e-07, "loss": 0.0005, "reward": 3.547218680381775, "reward_std": 0.12953244149684906, "rewards/final_reward": 1.7914925262084225, "rewards/mask_iou_reward": 0.8957462631042112, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5628437995910645, "rewards/thk_ans_format_reward": 0.984375, "step": 2568, "think_completion_length": 38.375 }, { "clip_ratio": 0.0, "completion_length": 113.8125, "epoch": 4.338954468802698, "grad_norm": 25.772815722112515, "kl": 2.287109375, "learning_rate": 1.3355817875210792e-07, "loss": 0.0023, "reward": 3.621209144592285, "reward_std": 0.02371341548860073, "rewards/final_reward": 1.2751909611401688, "rewards/mask_iou_reward": 0.6375954805700844, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6212090253829956, "rewards/thk_ans_format_reward": 1.0, "step": 2569, "think_completion_length": 42.1875 }, { "clip_ratio": 0.0, "completion_length": 111.5, "epoch": 4.340640809443507, "grad_norm": 7.315934620760691, "kl": 0.63671875, "learning_rate": 1.3322091062394603e-07, "loss": 0.0006, "reward": 3.53988254070282, "reward_std": 0.07551046088337898, "rewards/final_reward": 1.3149476657336536, "rewards/mask_iou_reward": 0.6574738328668268, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5398826003074646, "rewards/thk_ans_format_reward": 1.0, "step": 2570, "think_completion_length": 42.125 }, { "clip_ratio": 0.0, "completion_length": 110.890625, "epoch": 4.3423271500843175, "grad_norm": 14.984992365474874, "kl": 0.669921875, "learning_rate": 1.3288364249578415e-07, "loss": 0.0007, "reward": 3.1896384954452515, "reward_std": 0.024790717288851738, "rewards/final_reward": 1.6793651401459635, "rewards/mask_iou_reward": 0.8396825700729817, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1896384358406067, "rewards/thk_ans_format_reward": 1.0, "step": 2571, "think_completion_length": 40.84375 }, { "clip_ratio": 0.0, "completion_length": 112.5625, "epoch": 4.344013490725127, "grad_norm": 11.588965605464049, "kl": 0.603515625, "learning_rate": 1.3254637436762226e-07, "loss": 0.0006, "reward": 3.5344064235687256, "reward_std": 0.08637862093746662, "rewards/final_reward": 1.5520570082396912, "rewards/mask_iou_reward": 0.7760285041198456, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5344064235687256, "rewards/thk_ans_format_reward": 1.0, "step": 2572, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 114.140625, "epoch": 4.345699831365936, "grad_norm": 22.296246109705102, "kl": 0.623046875, "learning_rate": 1.3220910623946035e-07, "loss": 0.0006, "reward": 3.57146418094635, "reward_std": 0.010422832798212767, "rewards/final_reward": 1.2223940986785407, "rewards/mask_iou_reward": 0.6111970493392703, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5714643001556396, "rewards/thk_ans_format_reward": 1.0, "step": 2573, "think_completion_length": 43.84375 }, { "clip_ratio": 0.0, "completion_length": 113.203125, "epoch": 4.347386172006745, "grad_norm": 13.634815728942376, "kl": 0.578125, "learning_rate": 1.318718381112985e-07, "loss": 0.0006, "reward": 3.5449386835098267, "reward_std": 0.05515991151332855, "rewards/final_reward": 1.2598893903355834, "rewards/mask_iou_reward": 0.6299446951677917, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.544938564300537, "rewards/thk_ans_format_reward": 1.0, "step": 2574, "think_completion_length": 46.5625 }, { "clip_ratio": 0.0, "completion_length": 116.203125, "epoch": 4.349072512647555, "grad_norm": 8.606075795859018, "kl": 0.60546875, "learning_rate": 1.3153456998313657e-07, "loss": 0.0007, "reward": 3.432936906814575, "reward_std": 0.14704424515366554, "rewards/final_reward": 1.7833737091697128, "rewards/mask_iou_reward": 0.8916868545848564, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432936668395996, "rewards/thk_ans_format_reward": 1.0, "step": 2575, "think_completion_length": 49.15625 }, { "clip_ratio": 0.0, "completion_length": 113.78125, "epoch": 4.350758853288364, "grad_norm": 14.03312107127187, "kl": 0.572265625, "learning_rate": 1.3119730185497472e-07, "loss": 0.0006, "reward": 3.3003830909729004, "reward_std": 0.2684507966041565, "rewards/final_reward": 1.754745717307658, "rewards/mask_iou_reward": 0.877372858653829, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3003832697868347, "rewards/thk_ans_format_reward": 1.0, "step": 2576, "think_completion_length": 44.71875 }, { "clip_ratio": 0.0, "completion_length": 113.921875, "epoch": 4.3524451939291735, "grad_norm": 9.563513685502976, "kl": 0.599609375, "learning_rate": 1.308600337268128e-07, "loss": 0.0006, "reward": 3.6351295709609985, "reward_std": 0.033396379090845585, "rewards/final_reward": 1.834490034594674, "rewards/mask_iou_reward": 0.917245017297337, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6351295113563538, "rewards/thk_ans_format_reward": 1.0, "step": 2577, "think_completion_length": 43.5 }, { "clip_ratio": 0.0, "completion_length": 110.96875, "epoch": 4.354131534569984, "grad_norm": 9.94042408918705, "kl": 0.6171875, "learning_rate": 1.3052276559865092e-07, "loss": 0.0007, "reward": 3.280154228210449, "reward_std": 0.08011075738613727, "rewards/final_reward": 1.7138860590023135, "rewards/mask_iou_reward": 0.8569430295011567, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2801542282104492, "rewards/thk_ans_format_reward": 1.0, "step": 2578, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 115.09375, "epoch": 4.355817875210793, "grad_norm": 15.375330400711096, "kl": 0.5859375, "learning_rate": 1.3018549747048903e-07, "loss": 0.0006, "reward": 3.6954853534698486, "reward_std": 0.016146198846399784, "rewards/final_reward": 1.574730574071296, "rewards/mask_iou_reward": 0.787365287035648, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.695485234260559, "rewards/thk_ans_format_reward": 1.0, "step": 2579, "think_completion_length": 45.375 }, { "clip_ratio": 0.0, "completion_length": 110.234375, "epoch": 4.357504215851602, "grad_norm": 14.048520592159923, "kl": 0.59765625, "learning_rate": 1.2984822934232714e-07, "loss": 0.0006, "reward": 2.707605004310608, "reward_std": 0.04226122272666544, "rewards/final_reward": 1.4152098919735496, "rewards/mask_iou_reward": 0.7076049459867748, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7076049447059631, "rewards/thk_ans_format_reward": 1.0, "step": 2580, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 109.015625, "epoch": 4.359190556492411, "grad_norm": 9.831350349061688, "kl": 0.61328125, "learning_rate": 1.2951096121416523e-07, "loss": 0.0007, "reward": 3.4770760536193848, "reward_std": 0.1615507616661489, "rewards/final_reward": 1.2932119503645565, "rewards/mask_iou_reward": 0.6466059751822782, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.47707599401474, "rewards/thk_ans_format_reward": 1.0, "step": 2581, "think_completion_length": 38.8125 }, { "clip_ratio": 0.0, "completion_length": 102.359375, "epoch": 4.360876897133221, "grad_norm": 11.50889866748189, "kl": 0.591796875, "learning_rate": 1.2917369308600337e-07, "loss": 0.0006, "reward": 3.314904808998108, "reward_std": 0.06304793432354927, "rewards/final_reward": 1.483831413195893, "rewards/mask_iou_reward": 0.7419157065979465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3149048686027527, "rewards/thk_ans_format_reward": 1.0, "step": 2582, "think_completion_length": 47.34375 }, { "clip_ratio": 0.0, "completion_length": 146.65625, "epoch": 4.36256323777403, "grad_norm": 7.46278891755736, "kl": 0.5078125, "learning_rate": 1.2883642495784146e-07, "loss": 0.0005, "reward": 3.532469868659973, "reward_std": 0.1696995971724391, "rewards/final_reward": 1.5694744475666043, "rewards/mask_iou_reward": 0.7847372237833021, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.532469630241394, "rewards/thk_ans_format_reward": 1.0, "step": 2583, "think_completion_length": 52.8125 }, { "clip_ratio": 0.0, "completion_length": 110.96875, "epoch": 4.36424957841484, "grad_norm": 16.890277065484163, "kl": 0.5859375, "learning_rate": 1.284991568296796e-07, "loss": 0.0006, "reward": 3.673448085784912, "reward_std": 0.03351970575749874, "rewards/final_reward": 1.641488471538079, "rewards/mask_iou_reward": 0.8207442357690395, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6734481453895569, "rewards/thk_ans_format_reward": 1.0, "step": 2584, "think_completion_length": 41.4375 }, { "clip_ratio": 0.0, "completion_length": 110.46875, "epoch": 4.36593591905565, "grad_norm": 21.628494381474855, "kl": 0.64453125, "learning_rate": 1.2816188870151771e-07, "loss": 0.0006, "reward": 3.265789270401001, "reward_std": 0.0905698649585247, "rewards/final_reward": 0.8923392086039716, "rewards/mask_iou_reward": 0.4461696043019858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2657892405986786, "rewards/thk_ans_format_reward": 1.0, "step": 2585, "think_completion_length": 44.5625 }, { "clip_ratio": 0.0, "completion_length": 110.5625, "epoch": 4.367622259696459, "grad_norm": 10.898855924244831, "kl": 0.607421875, "learning_rate": 1.278246205733558e-07, "loss": 0.0006, "reward": 3.753677248954773, "reward_std": 0.16097071999683976, "rewards/final_reward": 1.9405172550981111, "rewards/mask_iou_reward": 0.9702586275490556, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7536773085594177, "rewards/thk_ans_format_reward": 1.0, "step": 2586, "think_completion_length": 40.34375 }, { "clip_ratio": 0.0, "completion_length": 119.84375, "epoch": 4.369308600337268, "grad_norm": 13.241927572902359, "kl": 0.591796875, "learning_rate": 1.2748735244519394e-07, "loss": 0.0006, "reward": 3.252161383628845, "reward_std": 0.15102306054905057, "rewards/final_reward": 1.6813825703169905, "rewards/mask_iou_reward": 0.8406912851584952, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2521612644195557, "rewards/thk_ans_format_reward": 1.0, "step": 2587, "think_completion_length": 47.125 }, { "clip_ratio": 0.0, "completion_length": 118.03125, "epoch": 4.370994940978077, "grad_norm": 7.9241462399636635, "kl": 0.5625, "learning_rate": 1.2715008431703203e-07, "loss": 0.0006, "reward": 3.655425548553467, "reward_std": 0.17257796972990036, "rewards/final_reward": 1.7522866017704373, "rewards/mask_iou_reward": 0.8761433008852186, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.655425488948822, "rewards/thk_ans_format_reward": 1.0, "step": 2588, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 136.078125, "epoch": 4.372681281618887, "grad_norm": 11.05652046882899, "kl": 0.603515625, "learning_rate": 1.2681281618887017e-07, "loss": 0.0006, "reward": 3.748861074447632, "reward_std": 0.09842104464769363, "rewards/final_reward": 1.6676742514306229, "rewards/mask_iou_reward": 0.8338371257153114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7488611340522766, "rewards/thk_ans_format_reward": 1.0, "step": 2589, "think_completion_length": 45.6875 }, { "clip_ratio": 0.0, "completion_length": 119.796875, "epoch": 4.3743676222596966, "grad_norm": 16.919804718328106, "kl": 0.564453125, "learning_rate": 1.2647554806070826e-07, "loss": 0.0006, "reward": 3.2739087343215942, "reward_std": 0.24324085749685764, "rewards/final_reward": 1.0611457163072324, "rewards/mask_iou_reward": 0.5305728581536162, "rewards/sam_format_reward": 0.9375, "rewards/sam_reward_func_ultra": 1.3364086747169495, "rewards/thk_ans_format_reward": 1.0, "step": 2590, "think_completion_length": 41.03125 }, { "clip_ratio": 0.0, "completion_length": 115.875, "epoch": 4.376053962900506, "grad_norm": 8.859520000216387, "kl": 0.58203125, "learning_rate": 1.2613827993254637e-07, "loss": 0.0006, "reward": 3.1826682090759277, "reward_std": 0.11050739884376526, "rewards/final_reward": 0.630540052892013, "rewards/mask_iou_reward": 0.3152700264460065, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1826683282852173, "rewards/thk_ans_format_reward": 1.0, "step": 2591, "think_completion_length": 50.1875 }, { "clip_ratio": 0.0, "completion_length": 126.921875, "epoch": 4.377740303541315, "grad_norm": 43.66077845280807, "kl": 0.59375, "learning_rate": 1.2580101180438448e-07, "loss": 0.0006, "reward": 3.089726448059082, "reward_std": 0.04616999439895153, "rewards/final_reward": 0.9430638960374498, "rewards/mask_iou_reward": 0.4715319480187249, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.089726448059082, "rewards/thk_ans_format_reward": 1.0, "step": 2592, "think_completion_length": 45.75 }, { "clip_ratio": 0.0, "completion_length": 111.390625, "epoch": 4.379426644182125, "grad_norm": 111.18989282333878, "kl": 0.56640625, "learning_rate": 1.254637436762226e-07, "loss": 0.0006, "reward": 3.495315194129944, "reward_std": 0.041504111140966415, "rewards/final_reward": 1.15804746014589, "rewards/mask_iou_reward": 0.579023730072945, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.495315134525299, "rewards/thk_ans_format_reward": 1.0, "step": 2593, "think_completion_length": 38.65625 }, { "clip_ratio": 0.0, "completion_length": 135.453125, "epoch": 4.381112984822934, "grad_norm": 11.511121528779169, "kl": 0.5390625, "learning_rate": 1.2512647554806069e-07, "loss": 0.0005, "reward": 3.4886449575424194, "reward_std": 0.14284783974289894, "rewards/final_reward": 1.4388147444293953, "rewards/mask_iou_reward": 0.7194073722146976, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4886450171470642, "rewards/thk_ans_format_reward": 1.0, "step": 2594, "think_completion_length": 46.59375 }, { "clip_ratio": 0.0, "completion_length": 113.65625, "epoch": 4.382799325463743, "grad_norm": 6.5032283973923, "kl": 0.560546875, "learning_rate": 1.2478920741989883e-07, "loss": 0.0006, "reward": 3.439482092857361, "reward_std": 0.03333376161754131, "rewards/final_reward": 1.4244483775725096, "rewards/mask_iou_reward": 0.7122241887862548, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4394821524620056, "rewards/thk_ans_format_reward": 1.0, "step": 2595, "think_completion_length": 45.5625 }, { "clip_ratio": 0.0, "completion_length": 113.296875, "epoch": 4.3844856661045535, "grad_norm": 7.433982934696911, "kl": 0.623046875, "learning_rate": 1.2445193929173694e-07, "loss": 0.0006, "reward": 2.9446091651916504, "reward_std": 0.021507996134459972, "rewards/final_reward": 0.7678388857192303, "rewards/mask_iou_reward": 0.38391944285961516, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9446091949939728, "rewards/thk_ans_format_reward": 1.0, "step": 2596, "think_completion_length": 43.1875 }, { "clip_ratio": 0.0, "completion_length": 132.5625, "epoch": 4.386172006745363, "grad_norm": 30.213404739416212, "kl": 0.591796875, "learning_rate": 1.2411467116357503e-07, "loss": 0.0006, "reward": 3.2194563150405884, "reward_std": 0.12624008324928582, "rewards/final_reward": 0.9775399201988229, "rewards/mask_iou_reward": 0.4887699600994114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2194563150405884, "rewards/thk_ans_format_reward": 1.0, "step": 2597, "think_completion_length": 37.625 }, { "clip_ratio": 0.0, "completion_length": 113.265625, "epoch": 4.387858347386172, "grad_norm": 27.848311657695973, "kl": 0.5859375, "learning_rate": 1.2377740303541314e-07, "loss": 0.0006, "reward": 3.732755422592163, "reward_std": 0.027668212191201746, "rewards/final_reward": 1.6713844987182405, "rewards/mask_iou_reward": 0.8356922493591202, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7327553629875183, "rewards/thk_ans_format_reward": 1.0, "step": 2598, "think_completion_length": 41.96875 }, { "clip_ratio": 0.0, "completion_length": 113.4375, "epoch": 4.389544688026981, "grad_norm": 8.426157508025339, "kl": 0.6171875, "learning_rate": 1.2344013490725125e-07, "loss": 0.0006, "reward": 2.883462429046631, "reward_std": 0.027737990021705627, "rewards/final_reward": 1.1196981285235068, "rewards/mask_iou_reward": 0.5598490642617534, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.8834625482559204, "rewards/thk_ans_format_reward": 1.0, "step": 2599, "think_completion_length": 44.65625 }, { "clip_ratio": 0.0, "completion_length": 127.1875, "epoch": 4.391231028667791, "grad_norm": 12.995200142244789, "kl": 0.607421875, "learning_rate": 1.2310286677908937e-07, "loss": 0.0006, "reward": 3.072115898132324, "reward_std": 0.22114570438861847, "rewards/final_reward": 0.7345745190405786, "rewards/mask_iou_reward": 0.3672872595202893, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0721158385276794, "rewards/thk_ans_format_reward": 1.0, "step": 2600, "think_completion_length": 50.0 }, { "clip_ratio": 0.0, "completion_length": 112.890625, "epoch": 4.3929173693086, "grad_norm": 13.855587724723787, "kl": 0.568359375, "learning_rate": 1.2276559865092748e-07, "loss": 0.0006, "reward": 3.5711183547973633, "reward_std": 0.05036383680999279, "rewards/final_reward": 1.4760295484223664, "rewards/mask_iou_reward": 0.7380147742111832, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5711183547973633, "rewards/thk_ans_format_reward": 1.0, "step": 2601, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 108.546875, "epoch": 4.3946037099494095, "grad_norm": 5.755886009335213, "kl": 0.548828125, "learning_rate": 1.224283305227656e-07, "loss": 0.0005, "reward": 3.100886583328247, "reward_std": 0.08746401220560074, "rewards/final_reward": 1.187160076599616, "rewards/mask_iou_reward": 0.593580038299808, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1008866727352142, "rewards/thk_ans_format_reward": 1.0, "step": 2602, "think_completion_length": 40.03125 }, { "clip_ratio": 0.0, "completion_length": 113.578125, "epoch": 4.39629005059022, "grad_norm": 39.64395810103559, "kl": 1.32421875, "learning_rate": 1.220910623946037e-07, "loss": 0.0013, "reward": 3.728816032409668, "reward_std": 0.06418004259467125, "rewards/final_reward": 1.7604291987169278, "rewards/mask_iou_reward": 0.8802145993584639, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7288159728050232, "rewards/thk_ans_format_reward": 1.0, "step": 2603, "think_completion_length": 42.40625 }, { "clip_ratio": 0.0, "completion_length": 153.734375, "epoch": 4.397976391231029, "grad_norm": 9.62940263705133, "kl": 0.533203125, "learning_rate": 1.2175379426644182e-07, "loss": 0.0005, "reward": 3.2988163232803345, "reward_std": 0.12963218614459038, "rewards/final_reward": 1.4517221758943717, "rewards/mask_iou_reward": 0.7258610879471858, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.298816204071045, "rewards/thk_ans_format_reward": 1.0, "step": 2604, "think_completion_length": 40.40625 }, { "clip_ratio": 0.0, "completion_length": 131.296875, "epoch": 4.399662731871838, "grad_norm": 7.593131325449928, "kl": 0.716796875, "learning_rate": 1.214165261382799e-07, "loss": 0.0007, "reward": 3.7163710594177246, "reward_std": 0.08841110952198505, "rewards/final_reward": 1.6647490690087037, "rewards/mask_iou_reward": 0.8323745345043518, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7163708806037903, "rewards/thk_ans_format_reward": 1.0, "step": 2605, "think_completion_length": 47.0 }, { "clip_ratio": 0.0, "completion_length": 109.4375, "epoch": 4.401349072512647, "grad_norm": 5.2326008033600635, "kl": 0.609375, "learning_rate": 1.2107925801011802e-07, "loss": 0.0006, "reward": 2.621520757675171, "reward_std": 0.13888922333717346, "rewards/final_reward": 0.48397687984975585, "rewards/mask_iou_reward": 0.24198843992487792, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.621520608663559, "rewards/thk_ans_format_reward": 1.0, "step": 2606, "think_completion_length": 39.21875 }, { "clip_ratio": 0.0, "completion_length": 120.6875, "epoch": 4.403035413153457, "grad_norm": 6.214061190619599, "kl": 0.626953125, "learning_rate": 1.2074198988195614e-07, "loss": 0.0006, "reward": 3.549259662628174, "reward_std": 0.08032980561256409, "rewards/final_reward": 1.4888046361899203, "rewards/mask_iou_reward": 0.7444023180949602, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5492598414421082, "rewards/thk_ans_format_reward": 1.0, "step": 2607, "think_completion_length": 49.3125 }, { "clip_ratio": 0.0, "completion_length": 114.09375, "epoch": 4.4047217537942664, "grad_norm": 6.938380100370504, "kl": 0.609375, "learning_rate": 1.2040472175379425e-07, "loss": 0.0006, "reward": 3.187752604484558, "reward_std": 0.05385738704353571, "rewards/final_reward": 1.4650717276018062, "rewards/mask_iou_reward": 0.7325358638009031, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1877525746822357, "rewards/thk_ans_format_reward": 1.0, "step": 2608, "think_completion_length": 36.78125 }, { "clip_ratio": 0.0, "completion_length": 116.34375, "epoch": 4.406408094435076, "grad_norm": 40.97108084569082, "kl": 0.609375, "learning_rate": 1.2006745362563237e-07, "loss": 0.0006, "reward": 3.8264540433883667, "reward_std": 0.1715675238519907, "rewards/final_reward": 1.9172816043021799, "rewards/mask_iou_reward": 0.9586408021510899, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.826453983783722, "rewards/thk_ans_format_reward": 1.0, "step": 2609, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 111.15625, "epoch": 4.408094435075886, "grad_norm": 8.004434533128602, "kl": 0.548828125, "learning_rate": 1.1973018549747048e-07, "loss": 0.0005, "reward": 2.7611573934555054, "reward_std": 0.22699527442455292, "rewards/final_reward": 0.8863231089658016, "rewards/mask_iou_reward": 0.4431615544829008, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7611575424671173, "rewards/thk_ans_format_reward": 1.0, "step": 2610, "think_completion_length": 41.78125 }, { "clip_ratio": 0.0, "completion_length": 114.65625, "epoch": 4.409780775716695, "grad_norm": 5.520014277334112, "kl": 0.56640625, "learning_rate": 1.193929173693086e-07, "loss": 0.0006, "reward": 3.1015899181365967, "reward_std": 0.11771095357835293, "rewards/final_reward": 0.458460322402633, "rewards/mask_iou_reward": 0.2292301612013165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.10158970952034, "rewards/thk_ans_format_reward": 1.0, "step": 2611, "think_completion_length": 45.25 }, { "clip_ratio": 0.0, "completion_length": 141.359375, "epoch": 4.411467116357504, "grad_norm": 9.160802093351666, "kl": 0.541015625, "learning_rate": 1.1905564924114671e-07, "loss": 0.0005, "reward": 3.1726194620132446, "reward_std": 0.0722682923078537, "rewards/final_reward": 1.6683353795083833, "rewards/mask_iou_reward": 0.8341676897541916, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.1726195812225342, "rewards/thk_ans_format_reward": 1.0, "step": 2612, "think_completion_length": 48.25 }, { "clip_ratio": 0.0, "completion_length": 131.609375, "epoch": 4.413153456998313, "grad_norm": 9.814321839776376, "kl": 0.59765625, "learning_rate": 1.1871838111298482e-07, "loss": 0.0006, "reward": 3.2006527185440063, "reward_std": 0.4120529443025589, "rewards/final_reward": 1.0384373302250594, "rewards/mask_iou_reward": 0.5192186651125297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2006526589393616, "rewards/thk_ans_format_reward": 1.0, "step": 2613, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 133.453125, "epoch": 4.414839797639123, "grad_norm": 9.044116285812068, "kl": 0.513671875, "learning_rate": 1.1838111298482292e-07, "loss": 0.0005, "reward": 3.4873398542404175, "reward_std": 0.09251206181943417, "rewards/final_reward": 1.3417746076498847, "rewards/mask_iou_reward": 0.6708873038249423, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4873397946357727, "rewards/thk_ans_format_reward": 1.0, "step": 2614, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 130.6875, "epoch": 4.416526138279933, "grad_norm": 11.461383350366239, "kl": 0.541015625, "learning_rate": 1.1804384485666104e-07, "loss": 0.0005, "reward": 3.5904886722564697, "reward_std": 0.07166932441759855, "rewards/final_reward": 1.625364095836272, "rewards/mask_iou_reward": 0.812682047918136, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5904887318611145, "rewards/thk_ans_format_reward": 1.0, "step": 2615, "think_completion_length": 49.40625 }, { "clip_ratio": 0.0, "completion_length": 97.125, "epoch": 4.418212478920742, "grad_norm": 103.55393362384335, "kl": 0.625, "learning_rate": 1.1770657672849915e-07, "loss": 0.0006, "reward": 3.6286131143569946, "reward_std": 0.06564396899193525, "rewards/final_reward": 1.8305318926860772, "rewards/mask_iou_reward": 0.9152659463430386, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6286131739616394, "rewards/thk_ans_format_reward": 1.0, "step": 2616, "think_completion_length": 39.96875 }, { "clip_ratio": 0.0, "completion_length": 115.59375, "epoch": 4.419898819561552, "grad_norm": 13.1873667863401, "kl": 0.6015625, "learning_rate": 1.1736930860033726e-07, "loss": 0.0006, "reward": 3.421006202697754, "reward_std": 0.32600878179073334, "rewards/final_reward": 1.6724000948973066, "rewards/mask_iou_reward": 0.8362000474486533, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4210062623023987, "rewards/thk_ans_format_reward": 1.0, "step": 2617, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 117.265625, "epoch": 4.421585160202361, "grad_norm": 9.256869268758377, "kl": 0.59765625, "learning_rate": 1.1703204047217538e-07, "loss": 0.0006, "reward": 3.6909961700439453, "reward_std": 0.016168599016964436, "rewards/final_reward": 1.5668830821447353, "rewards/mask_iou_reward": 0.7834415410723676, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6909962892532349, "rewards/thk_ans_format_reward": 1.0, "step": 2618, "think_completion_length": 46.84375 }, { "clip_ratio": 0.0, "completion_length": 115.703125, "epoch": 4.42327150084317, "grad_norm": 10.38758359315407, "kl": 0.673828125, "learning_rate": 1.1669477234401348e-07, "loss": 0.0007, "reward": 3.471518635749817, "reward_std": 0.03896564897149801, "rewards/final_reward": 1.2505938336831524, "rewards/mask_iou_reward": 0.6252969168415762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.471518635749817, "rewards/thk_ans_format_reward": 1.0, "step": 2619, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 113.015625, "epoch": 4.424957841483979, "grad_norm": 13.067336110176923, "kl": 0.615234375, "learning_rate": 1.1635750421585159e-07, "loss": 0.0006, "reward": 3.089142322540283, "reward_std": 0.26153238862752914, "rewards/final_reward": 1.465048299164459, "rewards/mask_iou_reward": 0.7325241495822294, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0891424119472504, "rewards/thk_ans_format_reward": 1.0, "step": 2620, "think_completion_length": 44.21875 }, { "clip_ratio": 0.0, "completion_length": 122.78125, "epoch": 4.4266441821247895, "grad_norm": 8.069392480164767, "kl": 0.5390625, "learning_rate": 1.160202360876897e-07, "loss": 0.0005, "reward": 2.5934301614761353, "reward_std": 0.11824558675289154, "rewards/final_reward": 0.417071096471484, "rewards/mask_iou_reward": 0.208535548235742, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5934301316738129, "rewards/thk_ans_format_reward": 1.0, "step": 2621, "think_completion_length": 46.03125 }, { "clip_ratio": 0.0, "completion_length": 123.671875, "epoch": 4.428330522765599, "grad_norm": 8.293984205803936, "kl": 0.548828125, "learning_rate": 1.1568296795952782e-07, "loss": 0.0005, "reward": 2.568539619445801, "reward_std": 0.1302037239074707, "rewards/final_reward": 0.7678146034402551, "rewards/mask_iou_reward": 0.38390730172012755, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.568539634346962, "rewards/thk_ans_format_reward": 1.0, "step": 2622, "think_completion_length": 45.15625 }, { "clip_ratio": 0.0, "completion_length": 114.046875, "epoch": 4.430016863406408, "grad_norm": 17.131262575480264, "kl": 0.5703125, "learning_rate": 1.1534569983136592e-07, "loss": 0.0006, "reward": 3.780028223991394, "reward_std": 0.03920180618297309, "rewards/final_reward": 1.7282832905265988, "rewards/mask_iou_reward": 0.8641416452632994, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.780028223991394, "rewards/thk_ans_format_reward": 1.0, "step": 2623, "think_completion_length": 43.65625 }, { "clip_ratio": 0.0, "completion_length": 190.25, "epoch": 4.431703204047217, "grad_norm": 6.430441929245077, "kl": 0.4765625, "learning_rate": 1.1500843170320403e-07, "loss": 0.0005, "reward": 2.705838918685913, "reward_std": 0.11969677184242755, "rewards/final_reward": 0.5233868673069243, "rewards/mask_iou_reward": 0.26169343365346215, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.7058388888835907, "rewards/thk_ans_format_reward": 1.0, "step": 2624, "think_completion_length": 45.8125 }, { "clip_ratio": 0.0, "completion_length": 112.984375, "epoch": 4.433389544688027, "grad_norm": 31.678806652203008, "kl": 0.650390625, "learning_rate": 1.1467116357504215e-07, "loss": 0.0007, "reward": 3.651994466781616, "reward_std": 0.028817713260650635, "rewards/final_reward": 1.6218849618589297, "rewards/mask_iou_reward": 0.8109424809294649, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.651994526386261, "rewards/thk_ans_format_reward": 1.0, "step": 2625, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 118.875, "epoch": 4.435075885328836, "grad_norm": 6.252600044446703, "kl": 0.515625, "learning_rate": 1.1433389544688026e-07, "loss": 0.0005, "reward": 3.4030381441116333, "reward_std": 0.11711015552282333, "rewards/final_reward": 1.3276957990905338, "rewards/mask_iou_reward": 0.6638478995452669, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.403038203716278, "rewards/thk_ans_format_reward": 1.0, "step": 2626, "think_completion_length": 45.46875 }, { "clip_ratio": 0.0, "completion_length": 114.625, "epoch": 4.4367622259696455, "grad_norm": 8.745195964000725, "kl": 0.63671875, "learning_rate": 1.1399662731871836e-07, "loss": 0.0006, "reward": 3.6559841632843018, "reward_std": 0.025955231860280037, "rewards/final_reward": 1.5211647887452338, "rewards/mask_iou_reward": 0.7605823943726169, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6559841632843018, "rewards/thk_ans_format_reward": 1.0, "step": 2627, "think_completion_length": 42.34375 }, { "clip_ratio": 0.0, "completion_length": 127.5, "epoch": 4.438448566610456, "grad_norm": 9.671689509302032, "kl": 0.4755859375, "learning_rate": 1.1365935919055649e-07, "loss": 0.0005, "reward": 3.4869346618652344, "reward_std": 0.11515981703996658, "rewards/final_reward": 1.492222922357928, "rewards/mask_iou_reward": 0.746111461178964, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4869346618652344, "rewards/thk_ans_format_reward": 1.0, "step": 2628, "think_completion_length": 42.5 }, { "clip_ratio": 0.0, "completion_length": 111.34375, "epoch": 4.440134907251265, "grad_norm": 23.11696267408195, "kl": 0.669921875, "learning_rate": 1.133220910623946e-07, "loss": 0.0007, "reward": 3.32826566696167, "reward_std": 0.21583092957735062, "rewards/final_reward": 1.651356880883315, "rewards/mask_iou_reward": 0.8256784404416575, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.32826566696167, "rewards/thk_ans_format_reward": 1.0, "step": 2629, "think_completion_length": 41.0 }, { "clip_ratio": 0.0, "completion_length": 111.96875, "epoch": 4.441821247892074, "grad_norm": 13.057917991823976, "kl": 0.609375, "learning_rate": 1.1298482293423272e-07, "loss": 0.0006, "reward": 3.688571810722351, "reward_std": 0.006875853752717376, "rewards/final_reward": 1.6953000644129448, "rewards/mask_iou_reward": 0.8476500322064724, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6885717511177063, "rewards/thk_ans_format_reward": 1.0, "step": 2630, "think_completion_length": 40.59375 }, { "clip_ratio": 0.0, "completion_length": 116.203125, "epoch": 4.443507588532883, "grad_norm": 8.93951053796139, "kl": 0.60546875, "learning_rate": 1.1264755480607083e-07, "loss": 0.0006, "reward": 3.3038218021392822, "reward_std": 0.06475062295794487, "rewards/final_reward": 0.7706906485427694, "rewards/mask_iou_reward": 0.3853453242713847, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.303821861743927, "rewards/thk_ans_format_reward": 1.0, "step": 2631, "think_completion_length": 47.90625 }, { "clip_ratio": 0.0, "completion_length": 135.4375, "epoch": 4.445193929173693, "grad_norm": 8.98465106731901, "kl": 0.5634765625, "learning_rate": 1.1231028667790893e-07, "loss": 0.0006, "reward": 3.3895570039749146, "reward_std": 0.07609788700938225, "rewards/final_reward": 1.0268260569306906, "rewards/mask_iou_reward": 0.5134130284653453, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3895567655563354, "rewards/thk_ans_format_reward": 1.0, "step": 2632, "think_completion_length": 48.46875 }, { "clip_ratio": 0.0, "completion_length": 137.375, "epoch": 4.4468802698145025, "grad_norm": 8.868591815915336, "kl": 0.603515625, "learning_rate": 1.1197301854974705e-07, "loss": 0.0006, "reward": 3.3486082553863525, "reward_std": 0.04780184803530574, "rewards/final_reward": 1.0677600247327752, "rewards/mask_iou_reward": 0.5338800123663876, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3486083149909973, "rewards/thk_ans_format_reward": 1.0, "step": 2633, "think_completion_length": 41.0625 }, { "clip_ratio": 0.0, "completion_length": 117.65625, "epoch": 4.448566610455312, "grad_norm": 7.070680793502134, "kl": 0.5625, "learning_rate": 1.1163575042158516e-07, "loss": 0.0006, "reward": 3.4986801147460938, "reward_std": 0.19939835742115974, "rewards/final_reward": 1.9185519446167523, "rewards/mask_iou_reward": 0.9592759723083761, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4986801147460938, "rewards/thk_ans_format_reward": 1.0, "step": 2634, "think_completion_length": 46.03125 }, { "clip_ratio": 0.0, "completion_length": 112.4375, "epoch": 4.450252951096122, "grad_norm": 12.153644101177255, "kl": 0.58984375, "learning_rate": 1.1129848229342327e-07, "loss": 0.0006, "reward": 3.038814663887024, "reward_std": 0.20692519284784794, "rewards/final_reward": 1.2421937835092174, "rewards/mask_iou_reward": 0.6210968917546087, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.038814753293991, "rewards/thk_ans_format_reward": 1.0, "step": 2635, "think_completion_length": 42.6875 }, { "clip_ratio": 0.0, "completion_length": 113.125, "epoch": 4.451939291736931, "grad_norm": 10.058118041299805, "kl": 0.603515625, "learning_rate": 1.1096121416526137e-07, "loss": 0.0006, "reward": 3.3655534982681274, "reward_std": 0.04708591848611832, "rewards/final_reward": 1.5861924009273762, "rewards/mask_iou_reward": 0.7930962004636881, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3655535578727722, "rewards/thk_ans_format_reward": 1.0, "step": 2636, "think_completion_length": 41.9375 }, { "clip_ratio": 0.0, "completion_length": 128.71875, "epoch": 4.45362563237774, "grad_norm": 11.354474998702795, "kl": 0.533203125, "learning_rate": 1.1062394603709949e-07, "loss": 0.0005, "reward": 3.6468669176101685, "reward_std": 0.06775764841586351, "rewards/final_reward": 1.629458788075444, "rewards/mask_iou_reward": 0.814729394037722, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6468668580055237, "rewards/thk_ans_format_reward": 1.0, "step": 2637, "think_completion_length": 44.1875 }, { "clip_ratio": 0.0, "completion_length": 115.796875, "epoch": 4.455311973018549, "grad_norm": 6.542571543632283, "kl": 0.59765625, "learning_rate": 1.102866779089376e-07, "loss": 0.0006, "reward": 2.995203733444214, "reward_std": 0.05667147785425186, "rewards/final_reward": 0.824913530327498, "rewards/mask_iou_reward": 0.412456765163749, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9952037483453751, "rewards/thk_ans_format_reward": 1.0, "step": 2638, "think_completion_length": 45.375 }, { "clip_ratio": 0.0, "completion_length": 137.609375, "epoch": 4.456998313659359, "grad_norm": 6.756942011236939, "kl": 0.5390625, "learning_rate": 1.0994940978077572e-07, "loss": 0.0005, "reward": 3.3548476696014404, "reward_std": 0.07482127472758293, "rewards/final_reward": 1.7545954018870025, "rewards/mask_iou_reward": 0.8772977009435012, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.354847490787506, "rewards/thk_ans_format_reward": 1.0, "step": 2639, "think_completion_length": 43.375 }, { "clip_ratio": 0.0, "completion_length": 136.4375, "epoch": 4.458684654300169, "grad_norm": 11.833040497849026, "kl": 0.5185546875, "learning_rate": 1.0961214165261383e-07, "loss": 0.0005, "reward": 3.5759642124176025, "reward_std": 0.027602959889918566, "rewards/final_reward": 1.5843839322703575, "rewards/mask_iou_reward": 0.7921919661351787, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5759642124176025, "rewards/thk_ans_format_reward": 1.0, "step": 2640, "think_completion_length": 40.90625 }, { "clip_ratio": 0.0, "completion_length": 113.859375, "epoch": 4.460370994940978, "grad_norm": 16.623365225933398, "kl": 0.5859375, "learning_rate": 1.0927487352445193e-07, "loss": 0.0005, "reward": 3.6343547105789185, "reward_std": 0.021240360219962895, "rewards/final_reward": 1.642886709337712, "rewards/mask_iou_reward": 0.821443354668856, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6343547701835632, "rewards/thk_ans_format_reward": 1.0, "step": 2641, "think_completion_length": 45.125 }, { "clip_ratio": 0.0, "completion_length": 135.546875, "epoch": 4.462057335581788, "grad_norm": 5.592280894385483, "kl": 0.4931640625, "learning_rate": 1.0893760539629004e-07, "loss": 0.0005, "reward": 3.7818983793258667, "reward_std": 0.11853981949388981, "rewards/final_reward": 1.88099498342845, "rewards/mask_iou_reward": 0.940497491714225, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7818983793258667, "rewards/thk_ans_format_reward": 1.0, "step": 2642, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 112.484375, "epoch": 4.463743676222597, "grad_norm": 8.582496126745752, "kl": 0.61328125, "learning_rate": 1.0860033726812816e-07, "loss": 0.0006, "reward": 3.078013300895691, "reward_std": 0.1424650065600872, "rewards/final_reward": 0.8963709606380095, "rewards/mask_iou_reward": 0.44818548031900474, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0780134201049805, "rewards/thk_ans_format_reward": 1.0, "step": 2643, "think_completion_length": 41.65625 }, { "clip_ratio": 0.0, "completion_length": 138.828125, "epoch": 4.465430016863406, "grad_norm": 46.17055861852506, "kl": 0.552734375, "learning_rate": 1.0826306913996627e-07, "loss": 0.0005, "reward": 3.219741702079773, "reward_std": 0.26992932334542274, "rewards/final_reward": 1.4363236266823287, "rewards/mask_iou_reward": 0.7181618133411644, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2197417914867401, "rewards/thk_ans_format_reward": 1.0, "step": 2644, "think_completion_length": 46.25 }, { "clip_ratio": 0.0, "completion_length": 113.859375, "epoch": 4.467116357504215, "grad_norm": 16.039393307939402, "kl": 0.646484375, "learning_rate": 1.0792580101180437e-07, "loss": 0.0007, "reward": 3.00642192363739, "reward_std": 0.0911721233278513, "rewards/final_reward": 1.3042461841878612, "rewards/mask_iou_reward": 0.6521230920939306, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0064219534397125, "rewards/thk_ans_format_reward": 1.0, "step": 2645, "think_completion_length": 44.5 }, { "clip_ratio": 0.0, "completion_length": 118.125, "epoch": 4.4688026981450255, "grad_norm": 14.585010775776698, "kl": 0.5703125, "learning_rate": 1.0758853288364249e-07, "loss": 0.0005, "reward": 3.214327096939087, "reward_std": 0.11271106917411089, "rewards/final_reward": 1.3228343652584247, "rewards/mask_iou_reward": 0.6614171826292123, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2143271565437317, "rewards/thk_ans_format_reward": 1.0, "step": 2646, "think_completion_length": 50.28125 }, { "clip_ratio": 0.0, "completion_length": 159.84375, "epoch": 4.470489038785835, "grad_norm": 11.422126760394363, "kl": 0.609375, "learning_rate": 1.072512647554806e-07, "loss": 0.0006, "reward": 3.3172987699508667, "reward_std": 0.07890355307608843, "rewards/final_reward": 1.827619537052433, "rewards/mask_iou_reward": 0.9138097685262165, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3172988891601562, "rewards/thk_ans_format_reward": 1.0, "step": 2647, "think_completion_length": 44.40625 }, { "clip_ratio": 0.0, "completion_length": 113.765625, "epoch": 4.472175379426644, "grad_norm": 8.19118901068108, "kl": 0.59375, "learning_rate": 1.0691399662731871e-07, "loss": 0.0006, "reward": 3.2954260110855103, "reward_std": 0.14317326247692108, "rewards/final_reward": 1.236711009110651, "rewards/mask_iou_reward": 0.6183555045553255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2954260110855103, "rewards/thk_ans_format_reward": 1.0, "step": 2648, "think_completion_length": 44.09375 }, { "clip_ratio": 0.0, "completion_length": 128.59375, "epoch": 4.473861720067454, "grad_norm": 7.837971132263195, "kl": 0.5625, "learning_rate": 1.0657672849915683e-07, "loss": 0.0006, "reward": 3.2900350093841553, "reward_std": 0.04385017417371273, "rewards/final_reward": 1.6600767316864327, "rewards/mask_iou_reward": 0.8300383658432163, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.290034830570221, "rewards/thk_ans_format_reward": 1.0, "step": 2649, "think_completion_length": 45.28125 }, { "clip_ratio": 0.0, "completion_length": 113.03125, "epoch": 4.475548060708263, "grad_norm": 6.960078569740148, "kl": 0.619140625, "learning_rate": 1.0623946037099493e-07, "loss": 0.0006, "reward": 3.0506186485290527, "reward_std": 0.11288509517908096, "rewards/final_reward": 0.9843538060173483, "rewards/mask_iou_reward": 0.49217690300867417, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0506186485290527, "rewards/thk_ans_format_reward": 1.0, "step": 2650, "think_completion_length": 45.78125 }, { "clip_ratio": 0.0, "completion_length": 111.6875, "epoch": 4.477234401349072, "grad_norm": 30.315258681626336, "kl": 0.595703125, "learning_rate": 1.0590219224283304e-07, "loss": 0.0006, "reward": 3.734488844871521, "reward_std": 0.028505256865173578, "rewards/final_reward": 1.8352535786063089, "rewards/mask_iou_reward": 0.9176267893031544, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7344887852668762, "rewards/thk_ans_format_reward": 1.0, "step": 2651, "think_completion_length": 41.75 }, { "clip_ratio": 0.0, "completion_length": 112.015625, "epoch": 4.4789207419898815, "grad_norm": 20.307580488842333, "kl": 0.619140625, "learning_rate": 1.0556492411467116e-07, "loss": 0.0006, "reward": 3.286136507987976, "reward_std": 0.12623700872063637, "rewards/final_reward": 1.3470083843749305, "rewards/mask_iou_reward": 0.6735041921874653, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.286136507987976, "rewards/thk_ans_format_reward": 1.0, "step": 2652, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 128.5, "epoch": 4.480607082630692, "grad_norm": 9.208181074924948, "kl": 0.740234375, "learning_rate": 1.0522765598650927e-07, "loss": 0.0008, "reward": 3.2891019582748413, "reward_std": 0.03242574352771044, "rewards/final_reward": 0.9127468298879465, "rewards/mask_iou_reward": 0.45637341494397327, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2891019880771637, "rewards/thk_ans_format_reward": 1.0, "step": 2653, "think_completion_length": 48.09375 }, { "clip_ratio": 0.0, "completion_length": 148.59375, "epoch": 4.482293423271501, "grad_norm": 10.054836029278828, "kl": 0.5048828125, "learning_rate": 1.0489038785834737e-07, "loss": 0.0005, "reward": 3.685955047607422, "reward_std": 0.015149123733863235, "rewards/final_reward": 1.9014045515848474, "rewards/mask_iou_reward": 0.9507022757924237, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6859549283981323, "rewards/thk_ans_format_reward": 1.0, "step": 2654, "think_completion_length": 41.375 }, { "clip_ratio": 0.0, "completion_length": 132.140625, "epoch": 4.48397976391231, "grad_norm": 10.076922778528303, "kl": 0.513671875, "learning_rate": 1.0455311973018548e-07, "loss": 0.0004, "reward": 3.829998254776001, "reward_std": 0.043199281208217144, "rewards/final_reward": 1.7935038134722294, "rewards/mask_iou_reward": 0.8967519067361147, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.829998254776001, "rewards/thk_ans_format_reward": 1.0, "step": 2655, "think_completion_length": 49.78125 }, { "clip_ratio": 0.0, "completion_length": 127.71875, "epoch": 4.48566610455312, "grad_norm": 14.24780204502807, "kl": 0.619140625, "learning_rate": 1.042158516020236e-07, "loss": 0.0006, "reward": 3.779955506324768, "reward_std": 0.0681462474167347, "rewards/final_reward": 1.9018116411955917, "rewards/mask_iou_reward": 0.9509058205977958, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7799555659294128, "rewards/thk_ans_format_reward": 1.0, "step": 2656, "think_completion_length": 44.09375 }, { "clip_ratio": 0.0, "completion_length": 133.578125, "epoch": 4.487352445193929, "grad_norm": 9.939901597778025, "kl": 0.7890625, "learning_rate": 1.0387858347386173e-07, "loss": 0.0008, "reward": 3.632006525993347, "reward_std": 0.016680723056197166, "rewards/final_reward": 1.846947845788642, "rewards/mask_iou_reward": 0.923473922894321, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.632006585597992, "rewards/thk_ans_format_reward": 1.0, "step": 2657, "think_completion_length": 42.03125 }, { "clip_ratio": 0.0, "completion_length": 115.171875, "epoch": 4.4890387858347385, "grad_norm": 5.408654799689108, "kl": 0.591796875, "learning_rate": 1.0354131534569983e-07, "loss": 0.0006, "reward": 3.5690382719039917, "reward_std": 0.029328839387744665, "rewards/final_reward": 1.3267483429997529, "rewards/mask_iou_reward": 0.6633741714998764, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5690380930900574, "rewards/thk_ans_format_reward": 1.0, "step": 2658, "think_completion_length": 44.28125 }, { "clip_ratio": 0.0, "completion_length": 113.0, "epoch": 4.490725126475548, "grad_norm": 9.187404635678938, "kl": 0.642578125, "learning_rate": 1.0320404721753794e-07, "loss": 0.0007, "reward": 3.5022149085998535, "reward_std": 0.025782881304621696, "rewards/final_reward": 1.1732110903545854, "rewards/mask_iou_reward": 0.5866055451772927, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5022149085998535, "rewards/thk_ans_format_reward": 1.0, "step": 2659, "think_completion_length": 42.21875 }, { "clip_ratio": 0.0, "completion_length": 154.8125, "epoch": 4.492411467116358, "grad_norm": 5.49395932994343, "kl": 0.515625, "learning_rate": 1.0286677908937605e-07, "loss": 0.0005, "reward": 3.5852386951446533, "reward_std": 0.09690108336508274, "rewards/final_reward": 1.3697257018342164, "rewards/mask_iou_reward": 0.6848628509171082, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5852386355400085, "rewards/thk_ans_format_reward": 1.0, "step": 2660, "think_completion_length": 43.9375 }, { "clip_ratio": 0.0, "completion_length": 111.40625, "epoch": 4.494097807757167, "grad_norm": 5.243002984426864, "kl": 0.568359375, "learning_rate": 1.0252951096121417e-07, "loss": 0.0006, "reward": 3.3164455890655518, "reward_std": 0.07848885655403137, "rewards/final_reward": 0.8967024527831223, "rewards/mask_iou_reward": 0.44835122639156116, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.316445529460907, "rewards/thk_ans_format_reward": 1.0, "step": 2661, "think_completion_length": 39.40625 }, { "clip_ratio": 0.0, "completion_length": 185.296875, "epoch": 4.495784148397976, "grad_norm": 3.473115692534412, "kl": 0.51171875, "learning_rate": 1.0219224283305228e-07, "loss": 0.0005, "reward": 3.041918635368347, "reward_std": 0.19841172359883785, "rewards/final_reward": 1.1676012257269686, "rewards/mask_iou_reward": 0.5838006128634843, "rewards/sam_format_reward": 0.96875, "rewards/sam_reward_func_ultra": 1.073168694972992, "rewards/thk_ans_format_reward": 1.0, "step": 2662, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 134.03125, "epoch": 4.497470489038786, "grad_norm": 13.879925985773543, "kl": 0.5068359375, "learning_rate": 1.0185497470489038e-07, "loss": 0.0005, "reward": 3.432920813560486, "reward_std": 0.31590735912323, "rewards/final_reward": 1.1663536039431972, "rewards/mask_iou_reward": 0.5831768019715986, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.432920753955841, "rewards/thk_ans_format_reward": 1.0, "step": 2663, "think_completion_length": 45.0625 }, { "clip_ratio": 0.0, "completion_length": 138.15625, "epoch": 4.499156829679595, "grad_norm": 7.630152597686654, "kl": 0.5390625, "learning_rate": 1.015177065767285e-07, "loss": 0.0005, "reward": 3.535985827445984, "reward_std": 0.21657665446400642, "rewards/final_reward": 1.5852197319364074, "rewards/mask_iou_reward": 0.7926098659682037, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5359859466552734, "rewards/thk_ans_format_reward": 1.0, "step": 2664, "think_completion_length": 42.4375 }, { "clip_ratio": 0.0, "completion_length": 112.5, "epoch": 4.500843170320405, "grad_norm": 24.542373673707836, "kl": 0.61328125, "learning_rate": 1.0118043844856661e-07, "loss": 0.0006, "reward": 3.67924702167511, "reward_std": 0.06537250243127346, "rewards/final_reward": 1.5337015185011438, "rewards/mask_iou_reward": 0.7668507592505719, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6792468428611755, "rewards/thk_ans_format_reward": 1.0, "step": 2665, "think_completion_length": 42.96875 }, { "clip_ratio": 0.0, "completion_length": 112.5, "epoch": 4.502529510961214, "grad_norm": 8.88138998484568, "kl": 0.62109375, "learning_rate": 1.0084317032040472e-07, "loss": 0.0007, "reward": 3.5684311389923096, "reward_std": 0.12402192875742912, "rewards/final_reward": 1.1883733673027432, "rewards/mask_iou_reward": 0.5941866836513716, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5684311389923096, "rewards/thk_ans_format_reward": 1.0, "step": 2666, "think_completion_length": 43.5625 }, { "clip_ratio": 0.0, "completion_length": 146.28125, "epoch": 4.504215851602024, "grad_norm": 5.665710067122564, "kl": 0.537109375, "learning_rate": 1.0050590219224282e-07, "loss": 0.0005, "reward": 3.340713143348694, "reward_std": 0.08154628158081323, "rewards/final_reward": 1.9216926105581993, "rewards/mask_iou_reward": 0.9608463052790996, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3407131731510162, "rewards/thk_ans_format_reward": 1.0, "step": 2667, "think_completion_length": 40.4375 }, { "clip_ratio": 0.0, "completion_length": 140.546875, "epoch": 4.505902192242833, "grad_norm": 13.118503767313488, "kl": 0.544921875, "learning_rate": 1.0016863406408094e-07, "loss": 0.0005, "reward": 3.4577395915985107, "reward_std": 0.10068205185234547, "rewards/final_reward": 1.1926356698317773, "rewards/mask_iou_reward": 0.5963178349158886, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4577394127845764, "rewards/thk_ans_format_reward": 1.0, "step": 2668, "think_completion_length": 38.84375 }, { "clip_ratio": 0.0, "completion_length": 124.421875, "epoch": 4.507588532883642, "grad_norm": 10.438658465372848, "kl": 0.5458984375, "learning_rate": 9.983136593591905e-08, "loss": 0.0005, "reward": 3.6971330642700195, "reward_std": 0.05630340613424778, "rewards/final_reward": 1.75558580296901, "rewards/mask_iou_reward": 0.877792901484505, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.69713294506073, "rewards/thk_ans_format_reward": 1.0, "step": 2669, "think_completion_length": 39.84375 }, { "clip_ratio": 0.0, "completion_length": 108.5, "epoch": 4.509274873524452, "grad_norm": 7.884218664723201, "kl": 0.6171875, "learning_rate": 9.949409780775717e-08, "loss": 0.0006, "reward": 3.3458492755889893, "reward_std": 0.31212668120861053, "rewards/final_reward": 0.873962773999293, "rewards/mask_iou_reward": 0.4369813869996465, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3458492755889893, "rewards/thk_ans_format_reward": 1.0, "step": 2670, "think_completion_length": 42.46875 }, { "clip_ratio": 0.0, "completion_length": 179.28125, "epoch": 4.5109612141652615, "grad_norm": 9.23283295335352, "kl": 0.611328125, "learning_rate": 9.915682967959528e-08, "loss": 0.0006, "reward": 3.2621822357177734, "reward_std": 0.4180988222360611, "rewards/final_reward": 1.569866105542606, "rewards/mask_iou_reward": 0.784933052771303, "rewards/sam_format_reward": 0.984375, "rewards/sam_reward_func_ultra": 1.293432354927063, "rewards/thk_ans_format_reward": 0.984375, "step": 2671, "think_completion_length": 37.65625 }, { "clip_ratio": 0.0, "completion_length": 143.875, "epoch": 4.512647554806071, "grad_norm": 19.30137884206897, "kl": 0.599609375, "learning_rate": 9.881956155143338e-08, "loss": 0.0006, "reward": 3.455308437347412, "reward_std": 0.13220026344060898, "rewards/final_reward": 1.458789095807849, "rewards/mask_iou_reward": 0.7293945479039246, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4553083777427673, "rewards/thk_ans_format_reward": 1.0, "step": 2672, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 109.078125, "epoch": 4.51433389544688, "grad_norm": 8.661303822832371, "kl": 0.61328125, "learning_rate": 9.84822934232715e-08, "loss": 0.0006, "reward": 2.5912883281707764, "reward_std": 0.16992703033611178, "rewards/final_reward": 0.321427037456387, "rewards/mask_iou_reward": 0.1607135187281935, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.5912883952260017, "rewards/thk_ans_format_reward": 1.0, "step": 2673, "think_completion_length": 38.03125 }, { "clip_ratio": 0.0, "completion_length": 115.390625, "epoch": 4.51602023608769, "grad_norm": 9.472047529229577, "kl": 0.623046875, "learning_rate": 9.814502529510961e-08, "loss": 0.0007, "reward": 3.272798538208008, "reward_std": 0.05844417680054903, "rewards/final_reward": 1.0032500490354634, "rewards/mask_iou_reward": 0.5016250245177317, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2727984189987183, "rewards/thk_ans_format_reward": 1.0, "step": 2674, "think_completion_length": 47.59375 }, { "clip_ratio": 0.0, "completion_length": 117.4375, "epoch": 4.517706576728499, "grad_norm": 7.799498451545803, "kl": 0.55078125, "learning_rate": 9.780775716694772e-08, "loss": 0.0006, "reward": 3.3780055046081543, "reward_std": 0.24958141800016165, "rewards/final_reward": 1.1998570376285973, "rewards/mask_iou_reward": 0.5999285188142987, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3780055046081543, "rewards/thk_ans_format_reward": 1.0, "step": 2675, "think_completion_length": 47.6875 }, { "clip_ratio": 0.0, "completion_length": 113.890625, "epoch": 4.519392917369308, "grad_norm": 16.83449299493527, "kl": 0.59375, "learning_rate": 9.747048903878582e-08, "loss": 0.0006, "reward": 3.4879668951034546, "reward_std": 0.17732420563697815, "rewards/final_reward": 1.0509339859279632, "rewards/mask_iou_reward": 0.5254669929639816, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.487967073917389, "rewards/thk_ans_format_reward": 1.0, "step": 2676, "think_completion_length": 45.0 }, { "clip_ratio": 0.0, "completion_length": 138.609375, "epoch": 4.5210792580101185, "grad_norm": 10.81027860870198, "kl": 0.59375, "learning_rate": 9.713322091062394e-08, "loss": 0.0006, "reward": 3.491172671318054, "reward_std": 0.08230869006365538, "rewards/final_reward": 1.9163851028379348, "rewards/mask_iou_reward": 0.9581925514189674, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4911725521087646, "rewards/thk_ans_format_reward": 1.0, "step": 2677, "think_completion_length": 43.96875 }, { "clip_ratio": 0.0, "completion_length": 112.140625, "epoch": 4.522765598650928, "grad_norm": 16.38177906884117, "kl": 0.626953125, "learning_rate": 9.679595278246205e-08, "loss": 0.0006, "reward": 3.313094973564148, "reward_std": 0.043430982856079936, "rewards/final_reward": 1.412640101241867, "rewards/mask_iou_reward": 0.7063200506209335, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3130950927734375, "rewards/thk_ans_format_reward": 1.0, "step": 2678, "think_completion_length": 41.78125 }, { "clip_ratio": 0.0, "completion_length": 154.71875, "epoch": 4.524451939291737, "grad_norm": 55.652422746933475, "kl": 0.5439453125, "learning_rate": 9.645868465430016e-08, "loss": 0.0005, "reward": 3.000455379486084, "reward_std": 0.12246969155967236, "rewards/final_reward": 0.7351268397904934, "rewards/mask_iou_reward": 0.3675634198952467, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0004554092884064, "rewards/thk_ans_format_reward": 1.0, "step": 2679, "think_completion_length": 42.625 }, { "clip_ratio": 0.0, "completion_length": 137.46875, "epoch": 4.526138279932546, "grad_norm": 16.0943834709773, "kl": 0.576171875, "learning_rate": 9.612141652613827e-08, "loss": 0.0006, "reward": 3.597416639328003, "reward_std": 0.1382383331656456, "rewards/final_reward": 1.431301174343287, "rewards/mask_iou_reward": 0.7156505871716435, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5974166989326477, "rewards/thk_ans_format_reward": 1.0, "step": 2680, "think_completion_length": 44.125 }, { "clip_ratio": 0.0, "completion_length": 127.796875, "epoch": 4.527824620573356, "grad_norm": 46.47438098712813, "kl": 0.58203125, "learning_rate": 9.578414839797638e-08, "loss": 0.0006, "reward": 3.5468854904174805, "reward_std": 0.06338476575911045, "rewards/final_reward": 1.5643748779855864, "rewards/mask_iou_reward": 0.7821874389927932, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5468855500221252, "rewards/thk_ans_format_reward": 1.0, "step": 2681, "think_completion_length": 47.34375 }, { "clip_ratio": 0.0, "completion_length": 159.234375, "epoch": 4.529510961214165, "grad_norm": 65.56906342241375, "kl": 0.501953125, "learning_rate": 9.544688026981449e-08, "loss": 0.0005, "reward": 3.229214310646057, "reward_std": 0.045433159917593, "rewards/final_reward": 0.9161283390685623, "rewards/mask_iou_reward": 0.45806416953428114, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.229214370250702, "rewards/thk_ans_format_reward": 1.0, "step": 2682, "think_completion_length": 44.0 }, { "clip_ratio": 0.0, "completion_length": 112.8125, "epoch": 4.5311973018549745, "grad_norm": 6.621407437686109, "kl": 0.6328125, "learning_rate": 9.510961214165261e-08, "loss": 0.0006, "reward": 3.581550717353821, "reward_std": 0.004469448467716575, "rewards/final_reward": 1.2020244633899486, "rewards/mask_iou_reward": 0.6010122316949743, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5815508365631104, "rewards/thk_ans_format_reward": 1.0, "step": 2683, "think_completion_length": 42.9375 }, { "clip_ratio": 0.0, "completion_length": 186.96875, "epoch": 4.532883642495785, "grad_norm": 5.598185861981018, "kl": 0.525390625, "learning_rate": 9.477234401349072e-08, "loss": 0.0005, "reward": 3.36284863948822, "reward_std": 0.1877284124493599, "rewards/final_reward": 1.7833434425748234, "rewards/mask_iou_reward": 0.8916717212874117, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3628486394882202, "rewards/thk_ans_format_reward": 1.0, "step": 2684, "think_completion_length": 39.9375 }, { "clip_ratio": 0.0, "completion_length": 127.515625, "epoch": 4.534569983136594, "grad_norm": 13.05266193699777, "kl": 0.5517578125, "learning_rate": 9.443507588532882e-08, "loss": 0.0006, "reward": 3.418926954269409, "reward_std": 0.12092643231153488, "rewards/final_reward": 1.430031399150564, "rewards/mask_iou_reward": 0.715015699575282, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.418927013874054, "rewards/thk_ans_format_reward": 1.0, "step": 2685, "think_completion_length": 43.4375 }, { "clip_ratio": 0.0, "completion_length": 113.09375, "epoch": 4.536256323777403, "grad_norm": 9.639271862123636, "kl": 0.609375, "learning_rate": 9.409780775716694e-08, "loss": 0.0006, "reward": 3.740368962287903, "reward_std": 0.021685122046619654, "rewards/final_reward": 1.768809005736757, "rewards/mask_iou_reward": 0.8844045028683785, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.7403690218925476, "rewards/thk_ans_format_reward": 1.0, "step": 2686, "think_completion_length": 45.59375 }, { "clip_ratio": 0.0, "completion_length": 114.875, "epoch": 4.537942664418212, "grad_norm": 8.237652072741113, "kl": 0.580078125, "learning_rate": 9.376053962900506e-08, "loss": 0.0006, "reward": 3.6791892051696777, "reward_std": 0.009476853301748633, "rewards/final_reward": 1.927470946512023, "rewards/mask_iou_reward": 0.9637354732560115, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.679189383983612, "rewards/thk_ans_format_reward": 1.0, "step": 2687, "think_completion_length": 42.8125 }, { "clip_ratio": 0.0, "completion_length": 122.9375, "epoch": 4.539629005059022, "grad_norm": 8.646826214633027, "kl": 0.55078125, "learning_rate": 9.342327150084318e-08, "loss": 0.0006, "reward": 3.4965227842330933, "reward_std": 0.17469704151153564, "rewards/final_reward": 1.4711453506980399, "rewards/mask_iou_reward": 0.7355726753490199, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.4965226650238037, "rewards/thk_ans_format_reward": 1.0, "step": 2688, "think_completion_length": 43.4375 }, { "clip_ratio": 0.0, "completion_length": 114.5625, "epoch": 4.541315345699831, "grad_norm": 10.657778008562978, "kl": 0.6015625, "learning_rate": 9.308600337268128e-08, "loss": 0.0006, "reward": 3.8605817556381226, "reward_std": 0.020185125060379505, "rewards/final_reward": 1.823982382216586, "rewards/mask_iou_reward": 0.911991191108293, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8605817556381226, "rewards/thk_ans_format_reward": 1.0, "step": 2689, "think_completion_length": 42.15625 }, { "clip_ratio": 0.0, "completion_length": 115.859375, "epoch": 4.543001686340641, "grad_norm": 9.243340633120326, "kl": 0.720703125, "learning_rate": 9.274873524451939e-08, "loss": 0.0007, "reward": 3.2710307836532593, "reward_std": 0.04928067233413458, "rewards/final_reward": 1.2541776348164744, "rewards/mask_iou_reward": 0.6270888174082372, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2710306346416473, "rewards/thk_ans_format_reward": 1.0, "step": 2690, "think_completion_length": 45.40625 }, { "clip_ratio": 0.0, "completion_length": 114.28125, "epoch": 4.544688026981451, "grad_norm": 7.80711729689743, "kl": 0.533203125, "learning_rate": 9.24114671163575e-08, "loss": 0.0005, "reward": 3.895188093185425, "reward_std": 0.013354545866604894, "rewards/final_reward": 1.987251964311651, "rewards/mask_iou_reward": 0.9936259821558255, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.8951881527900696, "rewards/thk_ans_format_reward": 1.0, "step": 2691, "think_completion_length": 43.34375 }, { "clip_ratio": 0.0, "completion_length": 115.578125, "epoch": 4.54637436762226, "grad_norm": 46.521774713314045, "kl": 0.59765625, "learning_rate": 9.207419898819562e-08, "loss": 0.0006, "reward": 3.3578141927719116, "reward_std": 0.13790087588131428, "rewards/final_reward": 1.3110319838483524, "rewards/mask_iou_reward": 0.6555159919241762, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3578141927719116, "rewards/thk_ans_format_reward": 1.0, "step": 2692, "think_completion_length": 42.3125 }, { "clip_ratio": 0.0, "completion_length": 114.65625, "epoch": 4.548060708263069, "grad_norm": 12.210287722551254, "kl": 0.642578125, "learning_rate": 9.173693086003373e-08, "loss": 0.0006, "reward": 3.5226752758026123, "reward_std": 0.03340075630694628, "rewards/final_reward": 1.2770168157985262, "rewards/mask_iou_reward": 0.6385084078992631, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.5226754546165466, "rewards/thk_ans_format_reward": 1.0, "step": 2693, "think_completion_length": 43.21875 }, { "clip_ratio": 0.0, "completion_length": 124.796875, "epoch": 4.549747048903878, "grad_norm": 5.288177156407879, "kl": 0.623046875, "learning_rate": 9.139966273187183e-08, "loss": 0.0006, "reward": 3.1990665197372437, "reward_std": 0.0531660639680922, "rewards/final_reward": 1.2057744653670666, "rewards/mask_iou_reward": 0.6028872326835333, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.199066400527954, "rewards/thk_ans_format_reward": 1.0, "step": 2694, "think_completion_length": 40.8125 }, { "clip_ratio": 0.0, "completion_length": 109.734375, "epoch": 4.551433389544688, "grad_norm": 17.94271726562434, "kl": 0.578125, "learning_rate": 9.106239460370995e-08, "loss": 0.0006, "reward": 3.2866199016571045, "reward_std": 0.06423737155273557, "rewards/final_reward": 1.3763903356116252, "rewards/mask_iou_reward": 0.6881951678058126, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.2866200506687164, "rewards/thk_ans_format_reward": 1.0, "step": 2695, "think_completion_length": 40.46875 }, { "clip_ratio": 0.0, "completion_length": 120.4375, "epoch": 4.5531197301854975, "grad_norm": 24.753576545821314, "kl": 0.5546875, "learning_rate": 9.072512647554806e-08, "loss": 0.0006, "reward": 3.3977984189987183, "reward_std": 0.14045307040214539, "rewards/final_reward": 1.4333959971150594, "rewards/mask_iou_reward": 0.7166979985575297, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.3977983593940735, "rewards/thk_ans_format_reward": 1.0, "step": 2696, "think_completion_length": 44.875 }, { "clip_ratio": 0.0, "completion_length": 138.78125, "epoch": 4.554806070826307, "grad_norm": 6.709423049230818, "kl": 0.521484375, "learning_rate": 9.038785834738617e-08, "loss": 0.0005, "reward": 2.9518043994903564, "reward_std": 0.07662581279873848, "rewards/final_reward": 0.9217370347905265, "rewards/mask_iou_reward": 0.46086851739526324, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 0.9518044590950012, "rewards/thk_ans_format_reward": 1.0, "step": 2697, "think_completion_length": 43.25 }, { "clip_ratio": 0.0, "completion_length": 114.078125, "epoch": 4.556492411467117, "grad_norm": 36.19834516157221, "kl": 0.60546875, "learning_rate": 9.005059021922427e-08, "loss": 0.0006, "reward": 3.086367964744568, "reward_std": 0.08483387529850006, "rewards/final_reward": 1.4563193932476, "rewards/mask_iou_reward": 0.7281596966238, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.0863679647445679, "rewards/thk_ans_format_reward": 1.0, "step": 2698, "think_completion_length": 46.875 }, { "clip_ratio": 0.0, "completion_length": 112.640625, "epoch": 4.558178752107926, "grad_norm": 5.445666177713425, "kl": 0.560546875, "learning_rate": 8.971332209106239e-08, "loss": 0.0006, "reward": 3.633660316467285, "reward_std": 0.08000330440700054, "rewards/final_reward": 1.769485983974965, "rewards/mask_iou_reward": 0.8847429919874825, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6336602568626404, "rewards/thk_ans_format_reward": 1.0, "step": 2699, "think_completion_length": 39.46875 }, { "clip_ratio": 0.0, "completion_length": 109.875, "epoch": 4.559865092748735, "grad_norm": 10.187202355665686, "kl": 0.626953125, "learning_rate": 8.93760539629005e-08, "loss": 0.0005, "reward": 3.619858741760254, "reward_std": 0.015684593934565783, "rewards/final_reward": 1.3947394514966747, "rewards/mask_iou_reward": 0.6973697257483373, "rewards/sam_format_reward": 1.0, "rewards/sam_reward_func_ultra": 1.6198588013648987, "rewards/thk_ans_format_reward": 1.0, "step": 2700, "think_completion_length": 41.34375 } ], "logging_steps": 1.0, "max_steps": 2965, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }