Publish DetectiveSAM inference bundle

7b474fb verified 26 days ago

13.5 kB

	from __future__ import annotations

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# One feature tensor per pyramid scale (indexed by scale position).
	FeaturePyramid = list[torch.Tensor]
	# For each pyramid scale, a list holding one feature tensor per extra stream.
	StreamPyramid = list[list[torch.Tensor]]


	class SharedAdapter(nn.Module):
	    """Applies a residual adapter to each feature scale.

	    For every scale, a stack of 1x1 convolutions maps the channel-wise
	    concatenation of the stream features to a correction term that is added
	    to the unadapted features of that scale.
	    """

	    def __init__(
	        self,
	        in_channels_list: list[int],
	        hidden_dim: int,
	        dropout_rate: float = 0.1,
	        max_streams: int = 2,
	    ) -> None:
	        """Create one tune / bottleneck / up projection per scale.

	        Args:
	            in_channels_list: Channel count of the features at each scale.
	            hidden_dim: Width of the adapter's hidden representation.
	            dropout_rate: Probability used by the channel-wise dropout.
	            max_streams: Number of streams concatenated per scale; values
	                below 1 are clamped to 1.
	        """
	        super().__init__()
	        max_streams = max(max_streams, 1)

	        self.mlps_tune = nn.ModuleList(
	            nn.Conv2d(max_streams * channels, hidden_dim, kernel_size=1)
	            for channels in in_channels_list
	        )
	        self.mlps_bottleneck = nn.ModuleList(
	            nn.Sequential(nn.Conv2d(hidden_dim, hidden_dim, kernel_size=1), nn.GELU())
	            for _ in in_channels_list
	        )
	        self.mlp_up = nn.ModuleList(
	            nn.Conv2d(hidden_dim, channels, kernel_size=1)
	            for channels in in_channels_list
	        )
	        self.activation = nn.GELU()
	        self.dropout = nn.Dropout2d(p=dropout_rate)

	    def forward(
	        self,
	        stream_features: list[torch.Tensor],
	        unadapted: torch.Tensor,
	        scale_idx: int,
	    ) -> torch.Tensor:
	        """Return ``unadapted`` plus the adapter correction for one scale.

	        Args:
	            stream_features: Per-stream feature maps for this scale, joined
	                along the channel axis. When empty, ``unadapted`` is used as
	                the adapter input instead.
	            unadapted: Features the correction is added to.
	            scale_idx: Index selecting the per-scale projection layers.

	        Returns:
	            A tensor with the same shape as ``unadapted``.
	        """
	        tune = self.mlps_tune[scale_idx]
	        fused_streams = torch.cat(stream_features, dim=1) if stream_features else unadapted

	        # The tune convolution is sized for ``max_streams`` concatenated
	        # streams. With fewer streams (including the ``unadapted`` fallback)
	        # the input would be too narrow and the convolution would raise, so
	        # the channels of the missing streams are filled with zeros.
	        missing_channels = tune.in_channels - fused_streams.shape[1]
	        if missing_channels > 0:
	            fused_streams = F.pad(fused_streams, (0, 0, 0, 0, 0, missing_channels))

	        hidden = tune(fused_streams)
	        hidden = self.activation(hidden)
	        hidden = self.dropout(hidden)
	        hidden = self.mlps_bottleneck[scale_idx](hidden)
	        delta = self.mlp_up[scale_idx](hidden)
	        return unadapted + delta


	class RefineBlock(nn.Module):
	    """Refines the coarse mask with low-level features.

	    Attention features and low-level features are joined along the channel
	    axis, passed through two 3x3 conv / GELU / dropout stages and a final
	    1x1 convolution; the result is added to the upsampled coarse mask.
	    """

	    def __init__(
	        self,
	        hidden_dim: int,
	        low_channels: int,
	        out_channels: int = 1,
	        dropout_rate: float = 0.0,
	    ) -> None:
	        """Create the refinement layers.

	        Args:
	            hidden_dim: Channels of the attention features and of the
	                intermediate activations.
	            low_channels: Channels of the low-level features.
	            out_channels: Channels of the produced correction.
	            dropout_rate: Probability used by both channel dropouts.
	        """
	        super().__init__()
	        merged_channels = hidden_dim + low_channels

	        # Stage 1: mix attention and low-level features.
	        self.conv1 = nn.Conv2d(merged_channels, hidden_dim, kernel_size=3, padding=1)
	        self.activation1 = nn.GELU()
	        self.dropout1 = nn.Dropout2d(p=dropout_rate)

	        # Stage 2: further spatial mixing at constant width.
	        self.conv2 = nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
	        self.activation2 = nn.GELU()
	        self.dropout2 = nn.Dropout2d(p=dropout_rate)

	        # Projection to the mask correction.
	        self.conv3 = nn.Conv2d(hidden_dim, out_channels, kernel_size=1)

	    def forward(
	        self,
	        attention_features: torch.Tensor,
	        low_features: torch.Tensor,
	        coarse_upsampled: torch.Tensor,
	    ) -> torch.Tensor:
	        """Return ``coarse_upsampled`` plus the predicted correction."""
	        features = torch.cat((attention_features, low_features), dim=1)
	        stages = (
	            (self.conv1, self.activation1, self.dropout1),
	            (self.conv2, self.activation2, self.dropout2),
	        )
	        for conv, activation, dropout in stages:
	            features = dropout(activation(conv(features)))
	        return coarse_upsampled + self.conv3(features)


	class CoarseProcessingBlock(nn.Module):
	    """Adds transformer-based coarse reasoning before refinement.

	    The fused features are downsampled, summed with a learned projection of
	    a normalized coordinate grid, run through a transformer encoder as a
	    token sequence, and blended back with the downsampled features through
	    a per-pixel sigmoid gate.
	    """

	    def __init__(
	        self,
	        hidden_dim: int,
	        attn_dim: int,
	        n_heads: int,
	        num_encoder_layers: int,
	        dropout_rate: float,
	        downscale: int,
	    ) -> None:
	        """Create the downsampling, attention and gating layers.

	        Args:
	            hidden_dim: Channels of the input and output feature maps.
	            attn_dim: Token width inside the transformer encoder.
	            n_heads: Number of attention heads.
	            num_encoder_layers: Number of stacked encoder layers.
	            dropout_rate: Probability shared by all dropout layers.
	            downscale: Kernel size and stride of the downsampling conv.
	        """
	        super().__init__()
	        self.hidden_dim = hidden_dim
	        self.coarse_down = nn.Sequential(
	            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=downscale, stride=downscale, groups=hidden_dim),
	            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=1),
	            nn.GELU(),
	            nn.Dropout2d(p=dropout_rate),
	        )
	        self.pos_embed_conv = nn.Conv2d(2, hidden_dim, kernel_size=1)
	        self.pos_dropout = nn.Dropout2d(p=dropout_rate)
	        self.feat_proj = nn.Sequential(
	            nn.Linear(hidden_dim, attn_dim),
	            nn.Dropout(p=dropout_rate),
	        )
	        encoder_layer = nn.TransformerEncoderLayer(
	            d_model=attn_dim,
	            nhead=n_heads,
	            dim_feedforward=attn_dim * 4,
	            dropout=dropout_rate,
	            activation="gelu",
	            batch_first=True,
	        )
	        self.transformer_encoder = nn.TransformerEncoder(
	            encoder_layer,
	            num_layers=num_encoder_layers,
	        )
	        self.transformer_out = nn.Sequential(
	            nn.Linear(attn_dim, hidden_dim),
	            nn.Dropout(p=dropout_rate),
	        )
	        self.residual_gate_conv = nn.Sequential(
	            nn.Conv2d(hidden_dim * 2, hidden_dim // 4, kernel_size=3, padding=1),
	            nn.GELU(),
	            nn.Dropout2d(p=dropout_rate),
	            nn.Conv2d(hidden_dim // 4, 1, kernel_size=1),
	        )
	        # Caches the (y, x) coordinate grid per (height, width). Only the
	        # weight-independent grid is cached; the projected encoding is
	        # recomputed on every call so it can never go stale when the weights
	        # of ``pos_embed_conv`` change (e.g. after ``load_state_dict``).
	        self.cached_pos_encodings: dict[tuple[int, int], torch.Tensor] = {}

	    def _coordinate_grid(self, height: int, width: int) -> torch.Tensor:
	        """Return the cached ``(1, 2, height, width)`` grid of coordinates in [-1, 1]."""
	        reference = self.pos_embed_conv.weight
	        key = (height, width)
	        grid = self.cached_pos_encodings.get(key)
	        if grid is None:
	            y_pos = torch.linspace(-1, 1, height, device=reference.device).view(height, 1).expand(height, width)
	            x_pos = torch.linspace(-1, 1, width, device=reference.device).view(1, width).expand(height, width)
	            grid = torch.stack([y_pos, x_pos], dim=0).unsqueeze(0)
	        # Follow the module when it is moved to another device or dtype.
	        if grid.device != reference.device or grid.dtype != reference.dtype:
	            grid = grid.to(device=reference.device, dtype=reference.dtype)
	        self.cached_pos_encodings[key] = grid
	        return grid

	    def _generate_pos_encoding(self, height: int, width: int) -> torch.Tensor:
	        """Project the coordinate grid to ``hidden_dim`` channels."""
	        return self.pos_embed_conv(self._coordinate_grid(height, width))

	    def _get_positional_encoding(self, batch_size: int, height: int, width: int) -> torch.Tensor:
	        """Return the positional encoding broadcast to ``batch_size``.

	        The encoding is detached, so no gradient reaches ``pos_embed_conv``
	        through it.
	        """
	        encoding = self._generate_pos_encoding(height, width).detach()
	        return encoding.expand(batch_size, -1, -1, -1)

	    def forward(self, fused: torch.Tensor) -> torch.Tensor:
	        """Return gated coarse features at the downsampled resolution.

	        Args:
	            fused: Feature map with ``hidden_dim`` channels.

	        Returns:
	            A per-pixel blend of the transformer output and the downsampled
	            input features, with ``hidden_dim`` channels.
	        """
	        coarse_features = self.coarse_down(fused)
	        batch_size, _, height, width = coarse_features.shape

	        pos_embed = self._get_positional_encoding(batch_size, height, width)
	        pos_embed = self.pos_dropout(pos_embed)
	        coarse_with_position = coarse_features + pos_embed

	        # (B, C, H, W) -> (B, H*W, C): one token per coarse pixel.
	        feature_sequence = coarse_with_position.flatten(2).permute(0, 2, 1)
	        feature_sequence = self.feat_proj(feature_sequence)
	        transformer_output = self.transformer_encoder(feature_sequence)
	        hidden = self.transformer_out(transformer_output)
	        # Back to a (B, C, H, W) feature map.
	        hidden = hidden.permute(0, 2, 1).view(batch_size, self.hidden_dim, height, width)

	        gate_input = torch.cat([hidden, coarse_features], dim=1)
	        residual_gate = torch.sigmoid(self.residual_gate_conv(gate_input))
	        return residual_gate * hidden + (1 - residual_gate) * coarse_features


	class FineProcessingBlock(nn.Module):
	    """Produces the coarse mask and uncertainty map.

	    Two 3x3 conv / GELU / dropout stages refine the features; separate 1x1
	    heads then predict a mask logit map and an uncertainty logit map, both
	    bilinearly resized to the requested output size.
	    """

	    def __init__(self, hidden_dim: int, dropout_rate: float) -> None:
	        """Create the refinement stack and the two prediction heads.

	        Args:
	            hidden_dim: Channels of the input and refined features.
	            dropout_rate: Probability used by the channel dropouts.
	        """
	        super().__init__()
	        layers: list[nn.Module] = []
	        for _ in range(2):
	            layers += [
	                nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
	                nn.GELU(),
	                nn.Dropout2d(p=dropout_rate),
	            ]
	        self.feature_refinement = nn.Sequential(*layers)
	        self.coarse_head = nn.Conv2d(hidden_dim, 1, kernel_size=1)
	        self.uncertainty_head = nn.Conv2d(hidden_dim, 1, kernel_size=1)

	    def forward(
	        self,
	        hidden: torch.Tensor,
	        output_size: tuple[int, int],
	    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Return refined features, coarse mask logits and uncertainty.

	        Args:
	            hidden: Feature map with ``hidden_dim`` channels.
	            output_size: Spatial size the two maps are resized to.

	        Returns:
	            The refined features at input resolution, the resized mask
	            logits, and the resized uncertainty after a sigmoid.
	        """

	        def resize(logits: torch.Tensor) -> torch.Tensor:
	            return F.interpolate(logits, size=output_size, mode="bilinear", align_corners=False)

	        refined = self.feature_refinement(hidden)
	        mask_logits = resize(self.coarse_head(refined))
	        uncertainty = torch.sigmoid(resize(self.uncertainty_head(refined)))
	        return refined, mask_logits, uncertainty


	class FeatureFusionBlockSpatial(nn.Module):
	    """Fuses original, adapted, and perturbed features with per-pixel attention.

	    At every scale the streams are concatenated, a small convolutional head
	    predicts one weight per stream and pixel (softmax over streams), and the
	    weighted sum of the streams is projected to ``hidden_dim`` channels. All
	    scales are resized to a common size and merged by a 1x1 convolution.
	    """

	    def __init__(
	        self,
	        in_channels_list: list[int],
	        hidden_dim: int = 128,
	        dropout_rate: float = 0.1,
	        max_streams: int = 2,
	        attn_reduction: int = 4,
	    ) -> None:
	        """Create the per-scale attention heads and projections.

	        Args:
	            in_channels_list: Channel count of the features at each scale.
	            hidden_dim: Channels of the per-scale projections and the output.
	            dropout_rate: Probability used by the channel dropouts.
	            max_streams: Number of extra streams per scale, in addition to
	                the adapted and unadapted features.
	            attn_reduction: Divisor applied to the concatenated channel
	                count to size the attention head's hidden layer.
	        """
	        super().__init__()
	        self.num_streams = 2 + max_streams
	        self.att_conv = nn.ModuleList()
	        self.proj_conv = nn.ModuleList()

	        groups = self.num_streams
	        for channels in in_channels_list:
	            total_channels = channels * groups
	            mid_channels = max(total_channels // attn_reduction, 8)
	            # A grouped convolution needs out_channels divisible by groups.
	            # Round up to the next multiple so every configuration builds;
	            # widths that were already divisible are left unchanged.
	            if groups > 0 and mid_channels % groups:
	                mid_channels += groups - mid_channels % groups
	            self.att_conv.append(
	                nn.Sequential(
	                    nn.Conv2d(
	                        total_channels,
	                        mid_channels,
	                        kernel_size=3,
	                        padding=1,
	                        groups=groups,
	                        bias=False,
	                    ),
	                    nn.GELU(),
	                    nn.Conv2d(mid_channels, groups, kernel_size=1, bias=False),
	                )
	            )
	            self.proj_conv.append(
	                nn.Sequential(
	                    nn.Conv2d(channels, hidden_dim, kernel_size=1),
	                    nn.GELU(),
	                    nn.Dropout2d(p=dropout_rate),
	                )
	            )

	        fusion_channels = hidden_dim * len(in_channels_list)
	        self.fuse_project = nn.Sequential(
	            nn.Conv2d(fusion_channels, hidden_dim, kernel_size=1),
	            nn.GELU(),
	            nn.Dropout2d(p=dropout_rate),
	        )

	    def forward(
	        self,
	        adapted: FeaturePyramid,
	        unadapted: FeaturePyramid,
	        streams_unadapted: StreamPyramid,
	        output_size: tuple[int, int],
	    ) -> torch.Tensor:
	        """Fuse all streams of all scales into one feature map.

	        Args:
	            adapted: Adapted features, one tensor per scale.
	            unadapted: Unadapted features, one tensor per scale.
	            streams_unadapted: Extra stream features, one list per scale.
	                The attention heads are sized for ``2 + max_streams``
	                streams per scale in total.
	            output_size: Spatial size every scale is resized to.

	        Returns:
	            A tensor with ``hidden_dim`` channels at ``output_size``.
	        """
	        fused_scales = []
	        for scale_idx, (att_head, projection) in enumerate(zip(self.att_conv, self.proj_conv)):
	            streams = [adapted[scale_idx], unadapted[scale_idx], *streams_unadapted[scale_idx]]
	            logits = att_head(torch.cat(streams, dim=1))
	            # (B, S, H, W) -> (B, S, 1, H, W) so the weights broadcast over channels.
	            weights = F.softmax(logits, dim=1).unsqueeze(2)
	            fused = (torch.stack(streams, dim=1) * weights).sum(dim=1)
	            fused = projection(fused)
	            fused = F.interpolate(
	                fused,
	                size=output_size,
	                mode="bilinear",
	                align_corners=False,
	            )
	            fused_scales.append(fused)

	        return self.fuse_project(torch.cat(fused_scales, dim=1))


	class MaskAdapter(nn.Module):
	    """Builds the prompt mask passed into the SAM decoder.

	    The pipeline fuses the feature pyramids, applies coarse transformer
	    reasoning, predicts a coarse mask with an uncertainty map, refines the
	    mask with low-level features, and blends coarse and refined masks with a
	    learned per-pixel gate.
	    """

	    def __init__(
	        self,
	        hidden_dim: int = 256,
	        downscale: int = 16,
	        output_resolution: tuple[int, int] = (128, 128),
	        in_channels_list: list[int] | None = None,
	        attn_dim: int = 16,
	        n_heads: int = 4,
	        num_encoder_layers: int = 2,
	        dropout_rate: float = 0.1,
	        max_streams: int = 2,
	    ) -> None:
	        """Create the fusion, coarse, fine, gating and refinement modules.

	        Args:
	            hidden_dim: Channel width shared by the internal feature maps.
	            downscale: Downsampling factor of the coarse branch.
	            output_resolution: ``(height, width)`` of the produced mask.
	            in_channels_list: Channel count of the features at each scale;
	                defaults to ``[256, 32, 64]`` when ``None`` or empty.
	            attn_dim: Token width inside the transformer encoder.
	            n_heads: Number of attention heads.
	            num_encoder_layers: Number of stacked encoder layers.
	            dropout_rate: Probability shared by the dropout layers.
	            max_streams: Number of extra streams per scale.
	        """
	        super().__init__()
	        channels = in_channels_list or [256, 32, 64]
	        self.downscale = downscale
	        self.output_resolution = output_resolution

	        self.feature_fusion = FeatureFusionBlockSpatial(
	            in_channels_list=channels,
	            hidden_dim=hidden_dim,
	            dropout_rate=dropout_rate,
	            max_streams=max_streams,
	        )
	        self.coarse_processor = CoarseProcessingBlock(
	            hidden_dim=hidden_dim,
	            attn_dim=attn_dim,
	            n_heads=n_heads,
	            num_encoder_layers=num_encoder_layers,
	            dropout_rate=dropout_rate,
	            downscale=downscale,
	        )
	        self.fine_processor = FineProcessingBlock(hidden_dim, dropout_rate)
	        self.spatial_gate = nn.Sequential(
	            nn.Conv2d(2, hidden_dim // 2, kernel_size=3, padding=1),
	            nn.GELU(),
	            nn.Dropout2d(p=dropout_rate),
	            nn.Conv2d(hidden_dim // 2, 1, kernel_size=1),
	            nn.Sigmoid(),
	        )
	        # forward() feeds ``unadapted[1]`` to the refinement head, so its
	        # low-level width must follow the second scale's channel count (32
	        # for the default pyramid). The previous fixed value is kept only
	        # when no second scale is configured.
	        low_channels = channels[1] if len(channels) > 1 else 32
	        self.refine_head = RefineBlock(
	            hidden_dim=hidden_dim,
	            low_channels=low_channels,
	            out_channels=1,
	            dropout_rate=dropout_rate,
	        )

	    def forward(
	        self,
	        adapted: FeaturePyramid,
	        streams_unadapted: StreamPyramid,
	        unadapted: FeaturePyramid,
	    ) -> torch.Tensor:
	        """Return the gated blend of the refined and coarse masks.

	        Args:
	            adapted: Adapted features, one tensor per scale.
	            streams_unadapted: Extra stream features, one list per scale.
	            unadapted: Unadapted features, one tensor per scale; the entry
	                at index 1 also serves as the low-level refinement input.

	        Returns:
	            The blended mask at ``output_resolution``.
	        """
	        output_height, output_width = self.output_resolution
	        output_size = (output_height, output_width)
	        coarse_size = (output_height // self.downscale, output_width // self.downscale)

	        fused = self.feature_fusion(adapted, unadapted, streams_unadapted, output_size)
	        hidden = self.coarse_processor(fused)
	        # Force the expected coarse grid if the strided conv produced another size.
	        if hidden.shape[-2:] != coarse_size:
	            hidden = F.adaptive_avg_pool2d(hidden, coarse_size)

	        hidden, coarse_mask, uncertainty_map = self.fine_processor(hidden, output_size)
	        attention_features = F.interpolate(hidden, size=output_size, mode="bilinear", align_corners=False)
	        low_features = F.interpolate(unadapted[1], size=output_size, mode="bilinear", align_corners=False)
	        refined_mask = self.refine_head(attention_features, low_features, coarse_mask)

	        # The gate decides per pixel how much of the refined mask to use.
	        spatial_gate = self.spatial_gate(torch.cat([coarse_mask, uncertainty_map], dim=1))
	        return spatial_gate * refined_mask + (1 - spatial_gate) * coarse_mask