Create convert.py
convert.py  ADDED  (+695 -0)
@@ -0,0 +1,695 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RT-DETRv2 checkpoints with Timm backbone"""

import argparse
import json
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision import transforms

from transformers import RTDetrImageProcessor
from modular_rtdetrv2 import RTDetrV2Config, RTDetrV2ForObjectDetection
from transformers.utils import logging


logging.set_verbosity_info()
logger = logging.get_logger(__name__)

def get_rt_detr_v2_config(model_name: str) -> RTDetrV2Config:
    config = RTDetrV2Config()

    config.num_labels = 80
    repo_id = "huggingface/label-files"
    filename = "coco-detection-mmdet-id2label.json"
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}

    if model_name == "rtdetr_v2_r18vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [2, 2, 2, 2]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 3
    elif model_name == "rtdetr_v2_r34vd":
        config.backbone_config.hidden_sizes = [64, 128, 256, 512]
        config.backbone_config.depths = [3, 4, 6, 3]
        config.backbone_config.layer_type = "basic"
        config.encoder_in_channels = [128, 256, 512]
        config.hidden_expansion = 0.5
        config.decoder_layers = 4
    elif model_name == "rtdetr_v2_r50vd_m":
        config.hidden_expansion = 0.5
    elif model_name == "rtdetr_v2_r50vd":
        pass
    elif model_name == "rtdetr_v2_r101vd":
        config.backbone_config.depths = [3, 4, 23, 3]
        config.encoder_ffn_dim = 2048
        config.encoder_hidden_dim = 384
        config.decoder_in_channels = [384, 384, 384]

    return config

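# As an illustration (added note, not part of the original script): get_rt_detr_v2_config("rtdetr_v2_r18vd")
# would return a config with 80 COCO labels, a "basic" ResNet-18-style backbone and 3 decoder layers,
# while "rtdetr_v2_r50vd" keeps the RTDetrV2Config defaults unchanged.
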
def create_rename_keys(config):
    # here we list all keys to be renamed (original name on the left, our name on the right)
    rename_keys = []

    # stem
    # fmt: off
    last_key = ["weight", "bias", "running_mean", "running_var"]

    for level in range(3):
        rename_keys.append((f"backbone.conv1.conv1_{level+1}.conv.weight", f"model.backbone.model.embedder.embedder.{level}.convolution.weight"))
        for last in last_key:
            rename_keys.append((f"backbone.conv1.conv1_{level+1}.norm.{last}", f"model.backbone.model.embedder.embedder.{level}.normalization.{last}"))

    for stage_idx in range(len(config.backbone_config.depths)):
        for layer_idx in range(config.backbone_config.depths[stage_idx]):
            # shortcut
            if layer_idx == 0:
                if stage_idx == 0:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.normalization.{last}",
                            )
                        )
                else:
                    rename_keys.append(
                        (
                            f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.conv.weight",
                            f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.convolution.weight",
                        )
                    )
                    for last in last_key:
                        rename_keys.append(
                            (
                                f"backbone.res_layers.{stage_idx}.blocks.0.short.conv.norm.{last}",
                                f"model.backbone.model.encoder.stages.{stage_idx}.layers.0.shortcut.1.normalization.{last}",
                            )
                        )

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2a.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.0.normalization.{last}",
                ))

            rename_keys.append(
                (
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.conv.weight",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.convolution.weight",
                )
            )
            for last in last_key:
                rename_keys.append((
                    f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2b.norm.{last}",
                    f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.1.normalization.{last}",
                ))

            # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/nn/backbone/presnet.py#L171
            if config.backbone_config.layer_type != "basic":
                rename_keys.append(
                    (
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.conv.weight",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.convolution.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append((
                        f"backbone.res_layers.{stage_idx}.blocks.{layer_idx}.branch2c.norm.{last}",
                        f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.2.normalization.{last}",
                    ))
    # fmt: on

    for i in range(config.encoder_layers):
        # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.weight",
                f"model.encoder.encoder.{i}.layers.0.fc1.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear1.bias",
                f"model.encoder.encoder.{i}.layers.0.fc1.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.weight",
                f"model.encoder.encoder.{i}.layers.0.fc2.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.linear2.bias",
                f"model.encoder.encoder.{i}.layers.0.fc2.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.weight",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm1.bias",
                f"model.encoder.encoder.{i}.layers.0.self_attn_layer_norm.bias",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.weight",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.weight",
            )
        )
        rename_keys.append(
            (
                f"encoder.encoder.{i}.layers.0.norm2.bias",
                f"model.encoder.encoder.{i}.layers.0.final_layer_norm.bias",
            )
        )

    for j in range(0, 3):
        rename_keys.append((f"encoder.input_proj.{j}.conv.weight", f"model.encoder_input_proj.{j}.0.weight"))
        for last in last_key:
            rename_keys.append((f"encoder.input_proj.{j}.norm.{last}", f"model.encoder_input_proj.{j}.1.{last}"))

    block_levels = 4

    for i in range(len(config.encoder_in_channels) - 1):
        # encoder layers: hybridencoder parts
        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.fpn_blocks.{i}.conv{j}.conv.weight", f"model.encoder.fpn_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.fpn_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        rename_keys.append((f"encoder.lateral_convs.{i}.conv.weight", f"model.encoder.lateral_convs.{i}.conv.weight"))
        for last in last_key:
            rename_keys.append(
                (f"encoder.lateral_convs.{i}.norm.{last}", f"model.encoder.lateral_convs.{i}.norm.{last}")
            )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.fpn_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        for j in range(1, block_levels):
            rename_keys.append(
                (f"encoder.pan_blocks.{i}.conv{j}.conv.weight", f"model.encoder.pan_blocks.{i}.conv{j}.conv.weight")
            )
            for last in last_key:
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                        f"model.encoder.pan_blocks.{i}.conv{j}.norm.{last}",
                    )
                )

        for j in range(3):
            for k in range(1, 3):
                rename_keys.append(
                    (
                        f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                        f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.conv.weight",
                    )
                )
                for last in last_key:
                    rename_keys.append(
                        (
                            f"encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                            f"model.encoder.pan_blocks.{i}.bottlenecks.{j}.conv{k}.norm.{last}",
                        )
                    )

        rename_keys.append(
            (f"encoder.downsample_convs.{i}.conv.weight", f"model.encoder.downsample_convs.{i}.conv.weight")
        )
        for last in last_key:
            rename_keys.append(
                (f"encoder.downsample_convs.{i}.norm.{last}", f"model.encoder.downsample_convs.{i}.norm.{last}")
            )

    for i in range(config.decoder_layers):
        # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.weight",
                f"model.decoder.layers.{i}.self_attn.out_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.self_attn.out_proj.bias",
                f"model.decoder.layers.{i}.self_attn.out_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.weight",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.sampling_offsets.bias",
                f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.weight",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.attention_weights.bias",
                f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.value_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.value_proj.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.weight",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.output_proj.bias",
                f"model.decoder.layers.{i}.encoder_attn.output_proj.bias",
            )
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")
        )
        rename_keys.append(
            (
                f"decoder.decoder.layers.{i}.cross_attn.num_points_scale",
                f"model.decoder.layers.{i}.encoder_attn.n_points_scale",
            )
        )
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight"))
        rename_keys.append((f"decoder.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias"))
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")
        )
        rename_keys.append(
            (f"decoder.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")
        )

    for i in range(config.decoder_layers):
        # decoder + class and bounding box heads
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.weight",
                f"model.decoder.class_embed.{i}.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_score_head.{i}.bias",
                f"model.decoder.class_embed.{i}.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.weight",
                f"model.decoder.bbox_embed.{i}.layers.0.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.0.bias",
                f"model.decoder.bbox_embed.{i}.layers.0.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.weight",
                f"model.decoder.bbox_embed.{i}.layers.1.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.1.bias",
                f"model.decoder.bbox_embed.{i}.layers.1.bias",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.weight",
                f"model.decoder.bbox_embed.{i}.layers.2.weight",
            )
        )
        rename_keys.append(
            (
                f"decoder.dec_bbox_head.{i}.layers.2.bias",
                f"model.decoder.bbox_embed.{i}.layers.2.bias",
            )
        )

    # decoder projection
    for i in range(len(config.decoder_in_channels)):
        rename_keys.append(
            (
                f"decoder.input_proj.{i}.conv.weight",
                f"model.decoder_input_proj.{i}.0.weight",
            )
        )
        for last in last_key:
            rename_keys.append(
                (
                    f"decoder.input_proj.{i}.norm.{last}",
                    f"model.decoder_input_proj.{i}.1.{last}",
                )
            )

    # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads
    rename_keys.extend(
        [
            ("decoder.denoising_class_embed.weight", "model.denoising_class_embed.weight"),
            ("decoder.query_pos_head.layers.0.weight", "model.decoder.query_pos_head.layers.0.weight"),
            ("decoder.query_pos_head.layers.0.bias", "model.decoder.query_pos_head.layers.0.bias"),
            ("decoder.query_pos_head.layers.1.weight", "model.decoder.query_pos_head.layers.1.weight"),
            ("decoder.query_pos_head.layers.1.bias", "model.decoder.query_pos_head.layers.1.bias"),
            ("decoder.enc_output.proj.weight", "model.enc_output.0.weight"),
            ("decoder.enc_output.proj.bias", "model.enc_output.0.bias"),
            ("decoder.enc_output.norm.weight", "model.enc_output.1.weight"),
            ("decoder.enc_output.norm.bias", "model.enc_output.1.bias"),
            ("decoder.enc_score_head.weight", "model.enc_score_head.weight"),
            ("decoder.enc_score_head.bias", "model.enc_score_head.bias"),
            ("decoder.enc_bbox_head.layers.0.weight", "model.enc_bbox_head.layers.0.weight"),
            ("decoder.enc_bbox_head.layers.0.bias", "model.enc_bbox_head.layers.0.bias"),
            ("decoder.enc_bbox_head.layers.1.weight", "model.enc_bbox_head.layers.1.weight"),
            ("decoder.enc_bbox_head.layers.1.bias", "model.enc_bbox_head.layers.1.bias"),
            ("decoder.enc_bbox_head.layers.2.weight", "model.enc_bbox_head.layers.2.weight"),
            ("decoder.enc_bbox_head.layers.2.bias", "model.enc_bbox_head.layers.2.bias"),
        ]
    )

    return rename_keys

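# For reference (added note, derived from the stem rule above): the first pair produced by
# create_rename_keys is ("backbone.conv1.conv1_1.conv.weight",
# "model.backbone.model.embedder.embedder.0.convolution.weight"); rename_key below applies
# each such pair to the original state dict, silently skipping keys that are absent.
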
def rename_key(state_dict, old, new):
    try:
        val = state_dict.pop(old)
        state_dict[new] = val
    except Exception:
        pass


def read_in_q_k_v(state_dict, config):
    prefix = ""
    encoder_hidden_dim = config.encoder_hidden_dim

    # first: transformer encoder
    for i in range(config.encoder_layers):
        # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias)
        in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[
            :encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[
            encoder_hidden_dim : 2 * encoder_hidden_dim, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[
            encoder_hidden_dim : 2 * encoder_hidden_dim
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[
            -encoder_hidden_dim:, :
        ]
        state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:]
    # next: transformer decoder (which is a bit more complex because it also includes cross-attention)
    for i in range(config.decoder_layers):
        # read in weights + bias of input projection layer of self-attention
        in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias")
        # next, add query, keys and values (in that order) to the state dict
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
        state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]


# We will verify our results on an image of cute cats
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)

    return im

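# Note on the fused projections handled above (added comment, a sketch of the layout): PyTorch's
# nn.MultiheadAttention stores q/k/v as a single in_proj_weight of shape (3 * hidden_dim, hidden_dim),
# so read_in_q_k_v slices it as
#   q_proj.weight = in_proj_weight[:hidden_dim]
#   k_proj.weight = in_proj_weight[hidden_dim : 2 * hidden_dim]
#   v_proj.weight = in_proj_weight[-hidden_dim:]
# (with hidden_dim hard-coded to 256 for the decoder), and splits the bias vector the same way.
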
@torch.no_grad()
def convert_rt_detr_v2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id):
    """
    Copy/paste/tweak model's weights to our RT-DETRv2 structure.
    """

    # load default config
    config = get_rt_detr_v2_config(model_name)

    # load original model from torch hub
    model_name_to_checkpoint_url = {
        "rtdetr_v2_r18vd": "https://github.com/lyuwenyu/storage/releases/download/v0.2/rtdetrv2_r18vd_120e_coco_rerun_48.1.pth",
        "rtdetr_v2_r34vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r34vd_120e_coco_ema.pth",
        "rtdetr_v2_r50vd_m": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_m_7x_coco_ema.pth",
        "rtdetr_v2_r50vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r50vd_6x_coco_ema.pth",
        "rtdetr_v2_r101vd": "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetrv2_r101vd_6x_coco_from_paddle.pth",
    }
    logger.info(f"Converting model {model_name}...")
    state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[
        "ema"
    ]["module"]

    # rename keys
    for src, dest in create_rename_keys(config):
        rename_key(state_dict, src, dest)
    # query, key and value matrices need special treatment
    read_in_q_k_v(state_dict, config)
    # drop BatchNorm tracking buffers and expose the two-stage box/class heads under their top-level names
    for key in state_dict.copy().keys():
        if key.endswith("num_batches_tracked"):
            del state_dict[key]
        # for two_stage
        if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key):
            state_dict[key.split("model.decoder.")[-1]] = state_dict[key]

    # These buffers are not required since they are static
    del state_dict["decoder.anchors"]
    del state_dict["decoder.valid_mask"]

    print("renaming is done")

    # finally, create HuggingFace model and load state dict
    model = RTDetrV2ForObjectDetection(config)
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    # load image processor
    image_processor = RTDetrImageProcessor()

    # prepare image
    img = prepare_img()

    # preprocess image
    transformations = transforms.Compose(
        [
            transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
        ]
    )
    original_pixel_values = transformations(img).unsqueeze(0)  # insert batch dimension

    encoding = image_processor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    assert torch.allclose(original_pixel_values, pixel_values)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    pixel_values = pixel_values.to(device)

    # Pass image through the model
    outputs = model(pixel_values)

    if model_name == "rtdetr_v2_r18vd":
        expected_slice_logits = torch.tensor(
            [[-3.7045, -5.1913, -6.1787], [-4.0106, -9.3450, -5.2043], [-4.1287, -4.7463, -5.8634]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.2582, 0.5497, 0.4764], [0.1684, 0.1985, 0.2120], [0.7665, 0.4146, 0.4669]]
        )
    elif model_name == "rtdetr_v2_r34vd":
        expected_slice_logits = torch.tensor(
            [[-4.6108, -5.9453, -3.8505], [-3.8702, -6.1136, -5.5677], [-3.7790, -6.4538, -5.9449]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.1691, 0.1984, 0.2118], [0.2594, 0.5506, 0.4736], [0.7669, 0.4136, 0.4654]]
        )
    elif model_name == "rtdetr_v2_r50vd_m":
        expected_slice_logits = torch.tensor(
            [[-2.7453, -5.4595, -7.3702], [-3.1858, -5.3803, -7.9838], [-5.0293, -7.0083, -4.2888]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.7711, 0.4135, 0.4577], [0.2570, 0.5480, 0.4755], [0.1694, 0.1992, 0.2127]]
        )
    elif model_name == "rtdetr_v2_r50vd":
        expected_slice_logits = torch.tensor(
            [[-4.7881, -4.6754, -6.1624], [-5.4441, -6.6486, -4.3840], [-3.5455, -4.9318, -6.3544]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.2588, 0.5487, 0.4747], [0.5497, 0.2760, 0.0573], [0.7688, 0.4133, 0.4634]]
        )
    elif model_name == "rtdetr_v2_r101vd":
        expected_slice_logits = torch.tensor(
            [[-4.6162, -4.9189, -4.6656], [-4.4701, -4.4997, -4.9659], [-5.6641, -7.9000, -5.0725]]
        )
        expected_slice_boxes = torch.tensor(
            [[0.7707, 0.4124, 0.4585], [0.2589, 0.5492, 0.4735], [0.1688, 0.1993, 0.2108]]
        )
    else:
        raise ValueError(f"Unknown rt_detr_v2_name: {model_name}")

    assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-3)
    assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3)

    if pytorch_dump_folder_path is not None:
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        image_processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # Upload model, image processor and config to the hub
        logger.info("Uploading PyTorch model and image processor to the hub...")
        config.push_to_hub(
            repo_id=repo_id,
            commit_message="Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )
        model.push_to_hub(
            repo_id=repo_id,
            commit_message="Add model from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )
        image_processor.push_to_hub(
            repo_id=repo_id,
            commit_message="Add image processor from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py",
        )

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="rtdetr_v2_r50vd",
        type=str,
        help="model_name of the checkpoint you'd like to convert.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.")
    parser.add_argument(
        "--repo_id",
        type=str,
        help="repo_id where the model will be pushed to.",
    )
    args = parser.parse_args()
    convert_rt_detr_v2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id)
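
# Example usage (a sketch, not part of the original file), assuming the converted weights were
# saved locally with --pytorch_dump_folder_path ./rtdetr_v2_r50vd:
#
#   python convert.py --model_name rtdetr_v2_r50vd --pytorch_dump_folder_path ./rtdetr_v2_r50vd
#
#   from modular_rtdetrv2 import RTDetrV2ForObjectDetection
#   from transformers import RTDetrImageProcessor
#
#   model = RTDetrV2ForObjectDetection.from_pretrained("./rtdetr_v2_r50vd")
#   processor = RTDetrImageProcessor.from_pretrained("./rtdetr_v2_r50vd")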