# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

import json
import math
import os
import types

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoBackbone

from .dinov2_with_windowed_attn import (
    WindowedDinov2WithRegistersBackbone,
    WindowedDinov2WithRegistersConfig,
)

size_to_width = {
    "tiny": 192,
    "small": 384,
    "base": 768,
    "large": 1024,
}

size_to_config = {
    "small": "dinov2_small.json",
    "base": "dinov2_base.json",
    "large": "dinov2_large.json",
}

size_to_config_with_registers = {
    "small": "dinov2_with_registers_small.json",
    "base": "dinov2_with_registers_base.json",
    "large": "dinov2_with_registers_large.json",
}
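
# Note: the widths above are the DINOv2 ViT hidden sizes (ViT-S = 384,
# ViT-B = 768, ViT-L = 1024); a plain ViT keeps one channel width across all
# blocks, which is why _out_feature_channels below repeats a single value.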

def get_config(size, use_registers):
    """Load the DINOv2 backbone config dict from the bundled JSON files."""
    config_dict = size_to_config_with_registers if use_registers else size_to_config
    current_dir = os.path.dirname(os.path.abspath(__file__))
    configs_dir = os.path.join(current_dir, "dinov2_configs")
    config_path = os.path.join(configs_dir, config_dict[size])
    with open(config_path, "r") as f:
        dino_config = json.load(f)
    return dino_config
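
# Usage sketch (illustrative; the exact JSON contents live in dinov2_configs/,
# but the fields read below are "image_size" and "patch_size"):
#
#   cfg = get_config("base", use_registers=True)
#   cfg["image_size"]   # 518 for stock DINOv2 (= 37 positions * 14 px patches)
#   cfg["patch_size"]   # 14
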
class DinoV2(nn.Module):
    """DINOv2 backbone that returns multi-scale features for RF-DETR."""

    def __init__(
        self,
        shape=(640, 640),
        out_feature_indexes=[2, 4, 5, 9],
        size="base",
        use_registers=True,
        use_windowed_attn=True,
        gradient_checkpointing=False,
        load_dinov2_weights=True,
        patch_size=14,
        num_windows=4,
        positional_encoding_size=37,
    ):
        super().__init__()
        name = f"facebook/dinov2-with-registers-{size}" if use_registers else f"facebook/dinov2-{size}"
        self.shape = shape
        self.patch_size = patch_size
        self.num_windows = num_windows

        # Create the encoder
        if not use_windowed_attn:
            assert not gradient_checkpointing, "Gradient checkpointing is not supported for non-windowed attention"
            assert load_dinov2_weights, "Using non-windowed attention requires loading dinov2 weights from hub"
            self.encoder = AutoBackbone.from_pretrained(
                name,
                out_features=[f"stage{i}" for i in out_feature_indexes],
                return_dict=False,
            )
        else:
            # Every block up to and including the last tapped index runs
            # windowed attention, except the tapped blocks themselves, which
            # attend globally before their features are emitted.
            window_block_indexes = set(range(out_feature_indexes[-1] + 1))
            window_block_indexes.difference_update(out_feature_indexes)
            window_block_indexes = list(window_block_indexes)

            dino_config = get_config(size, use_registers)
            dino_config["return_dict"] = False
            dino_config["out_features"] = [f"stage{i}" for i in out_feature_indexes]

            implied_resolution = positional_encoding_size * patch_size
            if implied_resolution != dino_config["image_size"]:
                print("Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.")
                dino_config["image_size"] = implied_resolution
                load_dinov2_weights = False

            if patch_size != 14:
                print(f"Using patch size {patch_size} instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.")
                dino_config["patch_size"] = patch_size
                load_dinov2_weights = False

            if use_registers:
                windowed_dino_config = WindowedDinov2WithRegistersConfig(
                    **dino_config,
                    num_windows=num_windows,
                    window_block_indexes=window_block_indexes,
                    gradient_checkpointing=gradient_checkpointing,
                )
            else:
                # Same config class, but with the register tokens disabled.
                windowed_dino_config = WindowedDinov2WithRegistersConfig(
                    **dino_config,
                    num_windows=num_windows,
                    window_block_indexes=window_block_indexes,
                    num_register_tokens=0,
                    gradient_checkpointing=gradient_checkpointing,
                )
            self.encoder = (
                WindowedDinov2WithRegistersBackbone.from_pretrained(
                    name,
                    config=windowed_dino_config,
                )
                if load_dinov2_weights
                else WindowedDinov2WithRegistersBackbone(windowed_dino_config)
            )
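
        # Worked example for the defaults (illustrative): out_feature_indexes
        # [2, 4, 5, 9] yields window_block_indexes [0, 1, 3, 6, 7, 8], i.e. six
        # of the first ten blocks are windowed and the four tapped ones are not.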
        self._out_feature_channels = [size_to_width[size]] * len(out_feature_indexes)
        self._export = False

    def export(self):
        """Bake the positional encodings for self.shape so that tracing and
        export avoid data-dependent interpolation at runtime."""
        if self._export:
            return
        self._export = True
        shape = self.shape

        def make_new_interpolated_pos_encoding(
            position_embeddings, patch_size, height, width
        ):
            num_positions = position_embeddings.shape[1] - 1
            dim = position_embeddings.shape[-1]
            height = height // patch_size
            width = width // patch_size

            class_pos_embed = position_embeddings[:, 0]
            patch_pos_embed = position_embeddings[:, 1:]

            # Reshape the flat position sequence back to its 2D grid
            patch_pos_embed = patch_pos_embed.reshape(
                1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
            )
            patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

            # Bicubic interpolation with antialiasing to the target grid
            patch_pos_embed = F.interpolate(
                patch_pos_embed,
                size=(height, width),
                mode="bicubic",
                align_corners=False,
                antialias=True,
            )

            # Flatten back to a sequence and re-attach the class token
            patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)
            return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

        # Pre-interpolate the positional embeddings for the export shape
        with torch.no_grad():
            new_positions = make_new_interpolated_pos_encoding(
                self.encoder.embeddings.position_embeddings,
                self.encoder.config.patch_size,
                shape[0],
                shape[1],
            )

        # Swap in a Parameter of the new size, and skip interpolation entirely
        # when the input already matches it.
        old_interpolate_pos_encoding = self.encoder.embeddings.interpolate_pos_encoding

        def new_interpolate_pos_encoding(self_mod, embeddings, height, width):
            num_patches = embeddings.shape[1] - 1
            num_positions = self_mod.position_embeddings.shape[1] - 1
            if num_patches == num_positions and height == width:
                return self_mod.position_embeddings
            return old_interpolate_pos_encoding(embeddings, height, width)

        self.encoder.embeddings.position_embeddings = nn.Parameter(new_positions)
        self.encoder.embeddings.interpolate_pos_encoding = types.MethodType(
            new_interpolate_pos_encoding,
            self.encoder.embeddings,
        )
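
    # Export sketch (hypothetical call sequence; "backbone.onnx" is just an
    # example path). With shape matching the runtime input, the fast path in
    # new_interpolate_pos_encoding returns the baked embeddings directly:
    #
    #   model = DinoV2(shape=(560, 560))
    #   model.export()
    #   torch.onnx.export(model, torch.randn(1, 3, 560, 560), "backbone.onnx")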

    def forward(self, x):
        block_size = self.patch_size * self.num_windows
        assert x.shape[2] % block_size == 0 and x.shape[3] % block_size == 0, f"Backbone requires input shape to be divisible by {block_size}, but got {x.shape}"
        x = self.encoder(x)
        return list(x[0])
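
# Shape rule, worked out for the defaults (illustrative): patch_size=14 and
# num_windows=4 give block_size = 14 * 4 = 56, so each input side must be a
# multiple of 56 (560, 616, 672, ...); 640 is not, since 640 % 56 == 24.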

if __name__ == "__main__":
    model = DinoV2()
    model.export()
    # 560 is divisible by block_size = 56, as required by the assertion in
    # forward().
    x = torch.randn(1, 3, 560, 560)
    outputs = model(x)  # run the forward pass once and reuse the result
    print(outputs)
    for feature_map in outputs:
        print(feature_map.shape)
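
    # A second sketch (hypothetical settings): a small backbone without
    # registers, tapping the last four blocks. Assuming the bundled JSON keeps
    # DINOv2's stock image_size and patch_size, pretrained weights still load.
    #   model = DinoV2(size="small", use_registers=False,
    #                  out_feature_indexes=[8, 9, 10, 11])
    #   feats = model(torch.randn(1, 3, 560, 560))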