# Hosting-page residue (not part of the module): uploaded by osherr, "Upload 222 files", commit bc90483 (verified).
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This software may be used and distributed in accordance with
# the terms of the DINOv3 License Agreement.
from functools import partial
import torch
from dinov3.eval.segmentation.models.backbone.dinov3_adapter import DINOv3_Adapter
from dinov3.eval.segmentation.models.heads.mask2former_head import Mask2FormerHead
# Per-backbone list of layer indices, keyed by backbone name. The segmentation
# builder in this file looks a backbone up here and hands the list to
# DINOv3_Adapter as ``interaction_indexes``.
BACKBONE_INTERMEDIATE_LAYERS = {
    "dinov3_vits16": [2, 5, 8, 11],
    "dinov3_vitb16": [2, 5, 8, 11],
    "dinov3_vitl16": [4, 11, 17, 23],
    "dinov3_vit7b16": [9, 19, 29, 39],
}
class FeatureDecoder(torch.nn.Module):
    """Run a list of modules under a caller-supplied autocast context.

    Args:
        segmentation_model: Modules applied in order by ``forward``.
            ``predict`` additionally requires that index 0 is callable on the
            raw inputs (the backbone) and that index 1 exposes
            ``predict(features, rescale_to=...)`` (the decoder head).
        autocast_ctx: Zero-argument callable returning a context manager
            (for example a ``functools.partial`` of ``torch.autocast``). It is
            entered around every ``forward`` / ``predict`` pass.
    """

    def __init__(self, segmentation_model: torch.nn.ModuleList, autocast_ctx):
        super().__init__()
        self.segmentation_model = segmentation_model
        self.autocast_ctx = autocast_ctx

    def forward(self, inputs):
        """Feed ``inputs`` through every module in order and return the result."""
        with self.autocast_ctx():
            for module in self.segmentation_model:
                # Invoke the module itself instead of ``module.forward`` so
                # that registered forward hooks / pre-hooks are executed, the
                # same way ``predict`` invokes the backbone.
                inputs = module(inputs)
        return inputs

    def predict(self, inputs, rescale_to=(512, 512)):
        """Run backbone then head prediction in inference mode.

        Args:
            inputs: Passed unchanged to the first module.
            rescale_to: Forwarded to the head's ``predict`` as ``rescale_to``.

        Returns:
            Whatever the head's ``predict`` returns.
        """
        with torch.inference_mode(), self.autocast_ctx():
            features = self.segmentation_model[0](inputs)  # backbone forward
            # decoder head prediction
            return self.segmentation_model[1].predict(features, rescale_to=rescale_to)
def build_segmentation_decoder(
    backbone_model,
    backbone_name,
    decoder_type,
    hidden_dim=2048,
    num_classes=150,
    autocast_dtype=torch.bfloat16,
):
    """Wrap a backbone and a decoder head into a ``FeatureDecoder``.

    Args:
        backbone_model: Backbone handed to ``DINOv3_Adapter``.
        backbone_name: Key into ``BACKBONE_INTERMEDIATE_LAYERS`` selecting the
            adapter's ``interaction_indexes``.
        decoder_type: Only ``"m2f"`` (Mask2Former head) is supported.
        hidden_dim: Forwarded to ``Mask2FormerHead``.
        num_classes: Forwarded to ``Mask2FormerHead``.
        autocast_dtype: dtype for the CUDA autocast context the returned
            model runs under.

    Returns:
        A ``FeatureDecoder`` holding ``[adapter, head]``.

    Raises:
        ValueError: If ``decoder_type`` is not ``"m2f"``.
        KeyError: If ``backbone_name`` is not in ``BACKBONE_INTERMEDIATE_LAYERS``.
    """
    # Reject unknown decoders up front; nothing is built in that case.
    if decoder_type != "m2f":
        raise ValueError(f'Unsupported decoder "{decoder_type}"')

    adapter = DINOv3_Adapter(
        backbone_model,
        interaction_indexes=BACKBONE_INTERMEDIATE_LAYERS[backbone_name],
    )
    # Only the adapter is switched to eval mode here; the head is left as built.
    adapter.eval()

    channels = adapter.backbone.embed_dim
    patch = adapter.patch_size
    # One entry per level "1".."4": [embed_dim, size, size, 4], with the size
    # going 4x, 2x, 1x and half the patch size.
    level_sizes = (patch * 4, patch * 2, patch, int(patch / 2))
    head = Mask2FormerHead(
        input_shape={
            str(level): [channels, size, size, 4]
            for level, size in enumerate(level_sizes, start=1)
        },
        hidden_dim=hidden_dim,
        num_classes=num_classes,
        ignore_value=255,
    )

    return FeatureDecoder(
        torch.nn.ModuleList([adapter, head]),
        autocast_ctx=partial(
            torch.autocast,
            device_type="cuda",
            enabled=True,
            dtype=autocast_dtype,
        ),
    )