# AnyThermal / model.py
# Uploaded by airlabshare with huggingface_hub (commit 0ff30df, verified).
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from transformers import PreTrainedModel, Dinov2Model, Dinov2Config
# =============================================================================
# HELPER: VPR Sinkhorn (Matches salad.py)
# =============================================================================
def log_otp_solver(log_a, log_b, M, num_iters: int = 20, reg: float = 1.0) -> torch.Tensor:
    """Entropy-regularized optimal transport via Sinkhorn iterations in log space.

    Args:
        log_a: (B, m) log row marginals.
        log_b: (B, n) log column marginals.
        M: (B, m, n) score/cost matrix.
        num_iters: number of Sinkhorn iterations.
        reg: regularization temperature; M is divided by it.

    Returns:
        (B, m, n) log transport plan M/reg + u[:, :, None] + v[:, None, :].
    """
    M = M / reg
    u, v = torch.zeros_like(log_a), torch.zeros_like(log_b)
    for _ in range(num_iters):
        # Alternating row/column scaling updates. The original code appended a
        # redundant .squeeze() here: logsumexp over dim=2/dim=1 already yields
        # (B, m) / (B, n), and an unconditional squeeze could silently drop
        # legitimate size-1 dimensions before the broadcast against log_a/log_b.
        u = log_a - torch.logsumexp(M + v.unsqueeze(1), dim=2)
        v = log_b - torch.logsumexp(M + u.unsqueeze(2), dim=1)
    return M + u.unsqueeze(2) + v.unsqueeze(1)
def get_matching_probs(S, dustbin_score=1.0, num_iters=3, reg=1.0):
    """Append a dustbin row to S and compute log assignment probabilities.

    S: (B, m, n) similarity scores (m clusters x n tokens; requires n > m so
    the dustbin can absorb the surplus mass). Returns a (B, m+1, n) log
    assignment matrix produced by Sinkhorn optimal transport.
    """
    batch_size, m, n = S.size()
    # Augmented score matrix: one extra row holding the learned dustbin score.
    S_aug = torch.empty(batch_size, m + 1, n, dtype=S.dtype, device=S.device)
    S_aug[:, :m, :] = S
    S_aug[:, m, :] = dustbin_score
    # Uniform log marginals over m + n cells; the dustbin row gets the
    # surplus mass log(n - m) on top.
    norm = -torch.tensor(math.log(n + m), device=S.device)
    log_a = norm.expand(m + 1).contiguous()
    log_b = norm.expand(n).contiguous()
    log_a[-1] = log_a[-1] + math.log(n - m)
    log_a = log_a.expand(batch_size, -1)
    log_b = log_b.expand(batch_size, -1)
    log_P = log_otp_solver(log_a, log_b, S_aug, num_iters=num_iters, reg=reg)
    # Undo the normalization so rows represent per-cluster probabilities.
    return log_P - norm
# =============================================================================
# 1. SEGMENTATION MODEL
# Matches NonLinearSegmentationHead64: Conv(0)->ReLU(1)->Dropout(2)->Conv(3)
# =============================================================================
class AnyThermalConfig(Dinov2Config):
    """DINOv2 configuration for the AnyThermal segmentation model (HF AutoClass hook)."""
    model_type = "anythermal"
class AnyThermalSegmentationModel(PreTrainedModel):
    """DINOv2 backbone + small convolutional head for semantic segmentation.

    The head layout Conv(0) -> ReLU(1) -> Dropout(2) -> Conv(3) mirrors the
    NonLinearSegmentationHead64 checkpoint this model is loaded from.
    """
    config_class = AnyThermalConfig

    def __init__(self, config):
        super().__init__(config)
        self.backbone = Dinov2Model(config)
        # Head definition matches NonlinearHead64 checkpoint keys
        # ("head.model.0", "head.model.3").
        self.head = nn.Module()
        self.head.model = nn.Sequential(
            nn.Conv2d(config.hidden_size, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d(p=0.0),
            nn.Conv2d(64, config.num_labels, kernel_size=1),
        )
        # Normalization constants as buffers so they follow the model's device/dtype.
        self.register_buffer("norm_mean", torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1))
        self.register_buffer("norm_std", torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1))
        self.post_init()

    def preprocess_input(self, x):
        """Replicates preprocess_dinov2:
        1. Resize H and W down to the nearest multiple of the 14-px patch size.
        2. Rescale 0-255 inputs to 0-1, then normalize with CLIP/ViT stats.
        """
        B, C, H, W = x.shape
        patch_size = 14
        # 1. Dynamic resize (snap to the patch grid).
        new_H = (H // patch_size) * patch_size
        new_W = (W // patch_size) * patch_size
        if new_H != H or new_W != W:
            x = F.interpolate(x, size=(new_H, new_W), mode='bilinear', align_corners=False)
        # 2. Normalize. Heuristic: values > 1 mean the input is 0-255.
        if x.max() > 1.0:
            x = x / 255.0
        x = (x - self.norm_mean) / self.norm_std
        return x

    def forward(self, pixel_values, labels=None, **kwargs):
        """Return per-pixel logits at the ORIGINAL input resolution.

        With `labels` given, also computes cross-entropy loss and returns
        {"loss": ..., "logits": ...}; otherwise returns the logits tensor.
        """
        # BUGFIX: remember the caller's resolution BEFORE snapping to the
        # patch grid. Previously the logits were interpolated to
        # pixel_values.shape AFTER preprocessing (the snapped size), despite
        # the "upscale back to input size" intent — which also broke the loss
        # against full-resolution labels for non-multiple-of-14 inputs.
        orig_size = pixel_values.shape[-2:]
        pixel_values = self.preprocess_input(pixel_values)
        outputs = self.backbone(pixel_values, **kwargs)
        # Drop the CLS token, keep patch tokens.
        features = outputs.last_hidden_state[:, 1:, :]
        B, L, C = features.shape
        # Assumes a square patch grid (H == W) — TODO confirm for non-square inputs.
        H = W = int(L ** 0.5)
        features = features.permute(0, 2, 1).reshape(B, C, H, W)
        logits = self.head.model(features)
        # Upscale back to the original input size.
        logits = F.interpolate(logits, size=orig_size, mode='bilinear', align_corners=False)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return {"loss": loss, "logits": logits}
        return logits
# =============================================================================
# 2. VPR MODEL (SALAD)
# Matches salad.py: Conv(0)->Dropout(1)->ReLU(2)->Conv(3) + dust_bin
# =============================================================================
class AnyThermalVPRConfig(Dinov2Config):
    """Configuration for the SALAD-style VPR model: DINOv2 settings plus head sizes."""

    model_type = "anythermal_vpr"

    def __init__(self, num_clusters=64, cluster_dim=128, token_dim=256, **kwargs):
        super().__init__(**kwargs)
        # SALAD aggregation hyper-parameters.
        self.num_clusters = num_clusters  # number of soft-assignment clusters
        self.cluster_dim = cluster_dim    # per-cluster local feature dimension
        self.token_dim = token_dim        # global (CLS) token projection dimension
class SALADHead(nn.Module):
    """SALAD aggregation head.

    Module ordering mirrors salad.py — Conv(0) -> Dropout(1) -> ReLU(2) ->
    Conv(3) plus a learned `dust_bin` — so state-dict keys match the checkpoint.
    """

    def __init__(self, config):
        super().__init__()
        self.num_channels = config.hidden_size
        self.num_clusters = config.num_clusters
        self.cluster_dim = config.cluster_dim
        self.token_dim = config.token_dim
        # Global branch: MLP applied to the CLS token.
        self.token_features = nn.Sequential(
            nn.Linear(self.num_channels, 512),
            nn.ReLU(),
            nn.Linear(512, self.token_dim),
        )
        # Local branch: project patch features to the cluster feature space.
        self.cluster_features = nn.Sequential(
            nn.Conv2d(self.num_channels, 512, 1),
            nn.Dropout(0.0),
            nn.ReLU(),
            nn.Conv2d(512, self.cluster_dim, 1),
        )
        # Per-patch cluster affinity scores.
        self.score = nn.Sequential(
            nn.Conv2d(self.num_channels, 512, 1),
            nn.Dropout(0.0),
            nn.ReLU(),
            nn.Conv2d(512, self.num_clusters, 1),
        )
        self.dust_bin = nn.Parameter(torch.tensor(1.))

    def forward(self, x_tuple):
        """x_tuple = (patch feature map (B, C, H, W), CLS token (B, C))."""
        feat_map, cls_tok = x_tuple
        local_feats = self.cluster_features(feat_map).flatten(2)  # (B, D, N)
        scores = self.score(feat_map).flatten(2)                  # (B, K, N)
        global_tok = self.token_features(cls_tok)
        # Soft assignment of patches to clusters via Sinkhorn with a dustbin.
        assign = torch.exp(get_matching_probs(scores, self.dust_bin, 3))
        assign = assign[:, :-1, :]  # drop the dustbin row
        # VLAD-style aggregation: per-cluster weighted sum of local features.
        assign = assign.unsqueeze(1).repeat(1, self.cluster_dim, 1, 1)
        feats_rep = local_feats.unsqueeze(2).repeat(1, 1, self.num_clusters, 1)
        vlad = (feats_rep * assign).sum(dim=-1)
        vlad = F.normalize(vlad, p=2, dim=1).flatten(1)
        # Final descriptor: normalized global token concatenated with VLAD.
        combined = torch.cat([F.normalize(global_tok, p=2, dim=-1), vlad], dim=-1)
        return F.normalize(combined, p=2, dim=-1)
class AnyThermalVPRModel(PreTrainedModel):
    """DINOv2 backbone + SALAD head producing a single global VPR descriptor."""

    config_class = AnyThermalVPRConfig

    def __init__(self, config):
        super().__init__(config)
        self.backbone = Dinov2Model(config)
        # Sequential wrapper keeps the checkpoint key prefix "vpr_head.0.*".
        self.vpr_head = nn.Sequential(SALADHead(config))
        self.post_init()

    def forward(self, pixel_values, **kwargs):
        outputs = self.backbone(pixel_values, **kwargs)
        hidden = outputs.last_hidden_state
        cls_token = hidden[:, 0, :]
        # Patch tokens -> (B, C, H, W) feature map; assumes a square patch grid.
        patch_tokens = hidden[:, 1:, :].transpose(1, 2)
        batch, channels, num_patches = patch_tokens.shape
        side = int(num_patches ** 0.5)
        patch_tokens = patch_tokens.reshape(batch, channels, side, side)
        return self.vpr_head[0]((patch_tokens, cls_token))
# =============================================================================
# 3. DEPTH MODEL (MiDaS)
# Matches vit.py indices: Identity(0,1,2) -> Conv(3) -> ConvTranspose(4)
# =============================================================================
class AnyThermalDepthConfig(Dinov2Config):
    """Configuration for the MiDaS-style depth model: DINOv2 settings + decoder width."""

    model_type = "anythermal_depth"

    def __init__(self, features=256, **kwargs):
        super().__init__(**kwargs)
        # Channel width shared by the scratch projections and refinenet blocks.
        self.features = features
class ResidualConvUnit(nn.Module):
    """Pre-activation residual block used by the MiDaS fusion decoder."""

    def __init__(self, features):
        super().__init__()
        self.conv1 = nn.Conv2d(features, features, 3, 1, 1, bias=True)
        self.conv2 = nn.Conv2d(features, features, 3, 1, 1, bias=True)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        # NOTE(review): self.relu is inplace, so the first call mutates `x`;
        # the skip connection therefore adds relu(x), matching MiDaS exactly.
        h = self.relu(x)
        h = self.conv1(h)
        h = self.relu(h)
        h = self.conv2(h)
        return h + x
class FeatureFusionBlock(nn.Module):
    """MiDaS fusion block: merge an optional skip input, refine, upsample x2."""

    def __init__(self, features):
        super().__init__()
        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        out = xs[0]
        if len(xs) == 2:
            skip = xs[1]
            # Match spatial sizes before adding the refined skip branch.
            if out.shape[-2:] != skip.shape[-2:]:
                out = F.interpolate(out, size=skip.shape[-2:], mode="bilinear", align_corners=True)
            out = out + self.resConfUnit1(skip)
        out = self.resConfUnit2(out)
        return F.interpolate(out, scale_factor=2, mode="bilinear", align_corners=True)
class AnyThermalDepthModel(PreTrainedModel):
    """DINOv2 backbone (768-dim, patch 14) with a MiDaS/DPT-style decoder for depth.

    Module names (scratch.*, pretrained.act_postprocess*) are chosen so the
    state-dict keys line up with the original MiDaS checkpoint.
    """
    config_class = AnyThermalDepthConfig
    def __init__(self, config):
        super().__init__(config)
        self.backbone = Dinov2Model(config)
        features = config.features
        # Bare containers so parameters get the "scratch."/"pretrained." prefixes.
        self.scratch = nn.Module()
        self.pretrained = nn.Module()
        # Project the four readout stages (96/192/384/768 ch) to a common width.
        self.scratch.layer1_rn = nn.Conv2d(96, features, 3, 1, 1, bias=False)
        self.scratch.layer2_rn = nn.Conv2d(192, features, 3, 1, 1, bias=False)
        self.scratch.layer3_rn = nn.Conv2d(384, features, 3, 1, 1, bias=False)
        self.scratch.layer4_rn = nn.Conv2d(768, features, 3, 1, 1, bias=False)
        # Padded with 3 Identities to shift Conv indices to 3 and 4
        # This aligns with the checkpoint keys (which had Slice/Transpose/Unflatten at 0-2)
        self.pretrained.act_postprocess1 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity(),
            nn.Conv2d(768, 96, 1), nn.ConvTranspose2d(96, 96, 4, 4)  # upsample x4
        )
        self.pretrained.act_postprocess2 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity(),
            nn.Conv2d(768, 192, 1), nn.ConvTranspose2d(192, 192, 2, 2)  # upsample x2
        )
        self.pretrained.act_postprocess3 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity(),
            nn.Conv2d(768, 384, 1)  # keep resolution
        )
        self.pretrained.act_postprocess4 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity(),
            nn.Conv2d(768, 768, 1), nn.Conv2d(768, 768, 3, 2, 1)  # downsample x2
        )
        # Top-down fusion pyramid; refinenet4 consumes the deepest stage first.
        self.scratch.refinenet4 = FeatureFusionBlock(features)
        self.scratch.refinenet3 = FeatureFusionBlock(features)
        self.scratch.refinenet2 = FeatureFusionBlock(features)
        self.scratch.refinenet1 = FeatureFusionBlock(features)
        # Final head; trailing ReLU keeps the predicted depth non-negative.
        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, 128, 3, 1, 1),
            nn.Upsample(scale_factor=1.75, mode="bilinear"),
            nn.Conv2d(128, 32, 3, 1, 1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, 1, 1, 0),
            nn.ReLU(True)
        )
        self.post_init()
    def forward(self, pixel_values):
        """Return a (B, H', W') depth map for (B, 3, H, W) input.

        NOTE(review): unlike the segmentation model there is no snapping here,
        so this assumes H and W are already multiples of 14 — confirm callers.
        """
        outputs = self.backbone(pixel_values, output_hidden_states=True)
        # Readout after transformer blocks 3, 6, 9 and 12
        # (hidden_states[0] is the embedding output, so index 12 is the last block).
        layers = [outputs.hidden_states[i] for i in [3, 6, 9, 12]]
        def process(l, h, w):
            # Drop the CLS token and reshape tokens to a (C, h/14, w/14) map.
            l = l[:, 1:, :].transpose(1, 2)
            return l.reshape(l.shape[0], l.shape[1], h//14, w//14)
        b, _, h, w = pixel_values.shape
        l1, l2, l3, l4 = [process(layers[i], h, w) for i in range(4)]
        layer_1_rn = self.scratch.layer1_rn(self.pretrained.act_postprocess1(l1))
        layer_2_rn = self.scratch.layer2_rn(self.pretrained.act_postprocess2(l2))
        layer_3_rn = self.scratch.layer3_rn(self.pretrained.act_postprocess3(l3))
        layer_4_rn = self.scratch.layer4_rn(self.pretrained.act_postprocess4(l4))
        # Coarse-to-fine fusion; each refinenet upsamples its output by 2.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
        return self.scratch.output_conv(path_1).squeeze(1)
# Register all classes with the HF Auto API so AutoModel.from_pretrained can
# resolve them from this repo (presumably requires trust_remote_code=True — verify).
AnyThermalSegmentationModel.register_for_auto_class("AutoModel")
AnyThermalVPRModel.register_for_auto_class("AutoModel")
AnyThermalDepthModel.register_for_auto_class("AutoModel")