Upload folder using huggingface_hub

0ab9351 verified 21 days ago

11.5 kB


	"""
	SAM2 Click Refinement RL Environment
	"""

	import gymnasium as gym
	from gymnasium import spaces
	import numpy as np
	import torch
	from PIL import Image
	from scipy.ndimage import distance_transform_edt


	def compute_dice(pred_mask, gt_mask):
	pred = pred_mask.astype(bool)
	gt = gt_mask.astype(bool)
	intersection = (pred & gt).sum()
	total = pred.sum() + gt.sum()
	if total == 0:
	return 1.0 if intersection == 0 else 0.0
	return float(2 * intersection / (total + 1e-8))


	def compute_iou(pred_mask, gt_mask):
	pred = pred_mask.astype(bool)
	gt = gt_mask.astype(bool)
	intersection = (pred & gt).sum()
	union = (pred \| gt).sum()
	if union == 0:
	return 1.0
	return float(intersection / (union + 1e-8))


	def oracle_click(pred_mask, gt_mask, noise_range=3):
	pred = pred_mask.astype(bool)
	gt = gt_mask.astype(bool)

	fn_mask = (~pred) & gt
	fp_mask = pred & (~gt)

	fn_dist = distance_transform_edt(fn_mask) if fn_mask.any() else np.zeros_like(pred_mask, dtype=float)
	fp_dist = distance_transform_edt(fp_mask) if fp_mask.any() else np.zeros_like(pred_mask, dtype=float)

	if fn_dist.max() >= fp_dist.max() and fn_mask.any():
	click_pos = np.unravel_index(fn_dist.argmax(), fn_dist.shape)
	label = 1
	elif fp_mask.any():
	click_pos = np.unravel_index(fp_dist.argmax(), fp_dist.shape)
	label = 0
	else:
	if gt.any():
	gt_dist = distance_transform_edt(gt)
	click_pos = np.unravel_index(gt_dist.argmax(), gt_dist.shape)
	label = 1
	else:
	click_pos = (pred_mask.shape[0] // 2, pred_mask.shape[1] // 2)
	label = 0

	if noise_range > 0:
	noise = np.random.randint(-noise_range, noise_range + 1, size=2)
	click_pos = (
	int(np.clip(click_pos[0] + noise[0], 0, pred_mask.shape[0] - 1)),
	int(np.clip(click_pos[1] + noise[1], 0, pred_mask.shape[1] - 1)),
	)

	return click_pos[0], click_pos[1], label


	def binary_erosion_safe(mask):
	from scipy.ndimage import binary_erosion
	if not mask.any():
	return mask
	result = binary_erosion(mask, iterations=2)
	if not result.any():
	return mask
	return result


	class SAM2ClickEnv(gym.Env):
	metadata = {"render_modes": ["rgb_array"]}

	def __init__(self, dataset=None, sam_predictor=None, obs_size=128, grid_size=32,
	max_clicks=5, click_radius=3, boundary_reward_weight=0.3,
	initial_click_noise=5, use_sam=True, split="train", render_mode=None):
	super().__init__()
	self.dataset = dataset
	self.sam_predictor = sam_predictor
	self.obs_size = obs_size
	self.grid_size = grid_size
	self.max_clicks = max_clicks
	self.click_radius = click_radius
	self.boundary_reward_weight = boundary_reward_weight
	self.initial_click_noise = initial_click_noise
	self.use_sam = use_sam
	self.render_mode = render_mode

	self.observation_space = spaces.Box(low=0, high=255, shape=(obs_size, obs_size, 6), dtype=np.uint8)
	self.action_space = spaces.Discrete(grid_size * grid_size * 2)

	self.current_image = None
	self.current_gt = None
	self.current_mask = None
	self.click_coords = []
	self.click_labels = []
	self.prev_dice = 0.0
	self.n_clicks = 0
	self.orig_h = 0
	self.orig_w = 0
	self._dataset_indices = None
	self._current_idx = 0

	def _load_random_sample(self):
	if self._dataset_indices is None:
	self._dataset_indices = np.random.permutation(len(self.dataset))
	self._current_idx = 0

	idx = int(self._dataset_indices[self._current_idx])
	self._current_idx = (self._current_idx + 1) % len(self._dataset_indices)
	if self._current_idx == 0:
	self._dataset_indices = np.random.permutation(len(self.dataset))

	sample = self.dataset[idx]
	image = sample["image"]
	mask = sample["mask"]

	if isinstance(image, Image.Image):
	image = image.convert("RGB")
	image = np.array(image)
	if isinstance(mask, Image.Image):
	mask = mask.convert("L")
	mask = np.array(mask)

	mask = (mask > 127).astype(np.uint8)
	return image, mask

	def _resize_for_obs(self, img, is_mask=False):
	pil_img = Image.fromarray(img)
	if is_mask:
	pil_img = pil_img.resize((self.obs_size, self.obs_size), Image.NEAREST)
	else:
	pil_img = pil_img.resize((self.obs_size, self.obs_size), Image.BILINEAR)
	return np.array(pil_img)

	def _make_click_heatmap(self, clicks_yx, orig_h, orig_w):
	heatmap = np.zeros((self.obs_size, self.obs_size), dtype=np.uint8)
	for (y, x) in clicks_yx:
	obs_y = int(y * self.obs_size / orig_h)
	obs_x = int(x * self.obs_size / orig_w)
	obs_y = np.clip(obs_y, 0, self.obs_size - 1)
	obs_x = np.clip(obs_x, 0, self.obs_size - 1)
	for dy in range(-self.click_radius, self.click_radius+1):
	for dx in range(-self.click_radius, self.click_radius+1):
	if dy2 + dx2 <= self.click_radius**2:
	ny, nx = obs_y + dy, obs_x + dx
	if 0 <= ny < self.obs_size and 0 <= nx < self.obs_size:
	heatmap[ny, nx] = 255
	return heatmap

	def _get_obs(self):
	img_resized = self._resize_for_obs(self.current_image)
	mask_resized = self._resize_for_obs((self.current_mask * 255).astype(np.uint8), is_mask=True)

	fg_yx = [(y, x) for (x, y), l in zip(self.click_coords, self.click_labels) if l == 1]
	bg_yx = [(y, x) for (x, y), l in zip(self.click_coords, self.click_labels) if l == 0]

	fg_heatmap = self._make_click_heatmap(fg_yx, self.orig_h, self.orig_w)
	bg_heatmap = self._make_click_heatmap(bg_yx, self.orig_h, self.orig_w)

	obs = np.stack([
	img_resized[:, :, 0], img_resized[:, :, 1], img_resized[:, :, 2],
	mask_resized, fg_heatmap, bg_heatmap,
	], axis=-1).astype(np.uint8)
	return obs

	def _run_sam(self):
	if not self.use_sam or self.sam_predictor is None:
	return self._simulate_mask()

	coords = np.array(self.click_coords, dtype=np.float32)
	labels = np.array(self.click_labels, dtype=np.int32)

	device = "cuda" if torch.cuda.is_available() else "cpu"
	ctx = torch.autocast(device, dtype=torch.bfloat16) if device == "cuda" else torch.inference_mode()
	with torch.inference_mode(), ctx:
	masks, scores, logits = self.sam_predictor.predict(
	point_coords=coords, point_labels=labels,
	multimask_output=(len(self.click_coords) == 1),
	)

	if len(masks.shape) == 3 and masks.shape[0] > 1:
	best_idx = np.argmax(scores)
	mask = masks[best_idx]
	else:
	mask = masks[0] if len(masks.shape) == 3 else masks

	return mask.astype(np.uint8)

	def _simulate_mask(self):
	if not hasattr(self, '_noise_mask'):
	noise = np.random.random(self.current_gt.shape) < 0.15
	self._noise_mask = self.current_gt.copy()
	from scipy.ndimage import binary_dilation, binary_erosion
	if np.random.random() < 0.5:
	self._noise_mask = binary_dilation(self._noise_mask, np.ones((7,7))).astype(np.uint8)
	else:
	self._noise_mask = binary_erosion(self._noise_mask, np.ones((5,5))).astype(np.uint8)
	self._noise_mask = (self._noise_mask ^ noise.astype(np.uint8)).astype(np.uint8)

	mask = self._noise_mask.copy()
	for (x, y), label in zip(self.click_coords, self.click_labels):
	radius = 20
	y0, y1 = max(0, int(y)-radius), min(mask.shape[0], int(y)+radius)
	x0, x1 = max(0, int(x)-radius), min(mask.shape[1], int(x)+radius)
	mask[y0:y1, x0:x1] = self.current_gt[y0:y1, x0:x1]
	return mask

	def reset(self, seed=None, options=None):
	super().reset(seed=seed)
	self.current_image, self.current_gt = self._load_random_sample()
	self.orig_h, self.orig_w = self.current_image.shape[:2]
	self.click_coords = []
	self.click_labels = []
	self.n_clicks = 0
	if hasattr(self, '_noise_mask'):
	del self._noise_mask

	if self.use_sam and self.sam_predictor is not None:
	device = "cuda" if torch.cuda.is_available() else "cpu"
	ctx = torch.autocast(device, dtype=torch.bfloat16) if device == "cuda" else torch.inference_mode()
	with torch.inference_mode(), ctx:
	self.sam_predictor.set_image(self.current_image)

	init_y, init_x, init_label = oracle_click(
	np.zeros_like(self.current_gt), self.current_gt,
	noise_range=self.initial_click_noise
	)
	self.click_coords.append((int(init_x), int(init_y)))
	self.click_labels.append(init_label)

	self.current_mask = self._run_sam()
	self.prev_dice = compute_dice(self.current_mask, self.current_gt)

	return self._get_obs(), {"dice": self.prev_dice, "iou": compute_iou(self.current_mask, self.current_gt)}

	def step(self, action):
	label = action % 2
	pos = action // 2
	grid_y = pos // self.grid_size
	grid_x = pos % self.grid_size

	orig_x = int((grid_x + 0.5) * self.orig_w / self.grid_size)
	orig_y = int((grid_y + 0.5) * self.orig_h / self.grid_size)
	orig_x = np.clip(orig_x, 0, self.orig_w - 1)
	orig_y = np.clip(orig_y, 0, self.orig_h - 1)

	self.click_coords.append((orig_x, orig_y))
	self.click_labels.append(1 if label == 0 else 0)

	self.current_mask = self._run_sam()

	new_dice = compute_dice(self.current_mask, self.current_gt)
	delta_dice = new_dice - self.prev_dice

	boundary_bonus = 0.0
	pred = self.current_mask.astype(bool)
	gt = self.current_gt.astype(bool)
	error_mask = pred != gt
	if error_mask.any():
	error_dist = distance_transform_edt(~error_mask)
	click_error_dist = error_dist[orig_y, orig_x]
	if click_error_dist < 10:
	boundary_bonus = 0.05 * (1.0 - click_error_dist / 10.0)

	reward = delta_dice + self.boundary_reward_weight * boundary_bonus

	self.prev_dice = new_dice
	self.n_clicks += 1
	terminated = self.n_clicks >= self.max_clicks
	truncated = False

	if new_dice > 0.95 and not terminated:
	reward += 0.1

	info = {
	"dice": new_dice, "iou": compute_iou(self.current_mask, self.current_gt),
	"delta_dice": delta_dice, "n_clicks": self.n_clicks + 1,
	"click_pos": (orig_x, orig_y), "click_label": self.click_labels[-1],
	}
	return self._get_obs(), float(reward), terminated, truncated, info

	def render(self):
	if self.render_mode == "rgb_array":
	return self._get_obs()[:, :, :3]
	return None