Add/Update backbone checkpoints (count=6)
Browse files- ds_model.py +55 -51
- ds_proc.py +57 -59
- manifest_20260212_202546.json +53 -0
- models/google__efficientnet-b0/config.json +1 -1
- models/google__efficientnet-b0/ds_model.py +55 -51
- models/google__efficientnet-b0/ds_proc.py +57 -59
- models/google__efficientnet-b0/model.safetensors +1 -1
- models/google__vit-base-patch16-224/config.json +1 -1
- models/google__vit-base-patch16-224/ds_model.py +55 -51
- models/google__vit-base-patch16-224/ds_proc.py +57 -59
- models/google__vit-base-patch16-224/model.safetensors +1 -1
- models/microsoft__resnet-50/config.json +1 -1
- models/microsoft__resnet-50/ds_model.py +55 -51
- models/microsoft__resnet-50/ds_proc.py +57 -59
- models/microsoft__resnet-50/model.safetensors +1 -1
- models/microsoft__swin-tiny-patch4-window7-224/config.json +1 -1
- models/microsoft__swin-tiny-patch4-window7-224/ds_model.py +55 -51
- models/microsoft__swin-tiny-patch4-window7-224/ds_proc.py +57 -59
- models/microsoft__swin-tiny-patch4-window7-224/model.safetensors +1 -1
- models/timm__densenet121.tv_in1k/config.json +1 -1
- models/timm__densenet121.tv_in1k/ds_model.py +55 -51
- models/timm__densenet121.tv_in1k/ds_proc.py +57 -59
- models/timm__densenet121.tv_in1k/model.safetensors +1 -1
- models/torchvision__densenet121/config.json +1 -1
- models/torchvision__densenet121/ds_model.py +55 -51
- models/torchvision__densenet121/ds_proc.py +57 -59
- models/torchvision__densenet121/model.safetensors +1 -1
ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
manifest_20260212_202546.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "20260212_202546",
|
| 3 |
+
"repo_id": "dsaint31/bb_mlp_224",
|
| 4 |
+
"revision": "main",
|
| 5 |
+
"tag": null,
|
| 6 |
+
"num_labels": 3,
|
| 7 |
+
"build_device": "mps",
|
| 8 |
+
"count": 6,
|
| 9 |
+
"items": [
|
| 10 |
+
{
|
| 11 |
+
"backbone": "google/vit-base-patch16-224",
|
| 12 |
+
"subdir": "models/google__vit-base-patch16-224",
|
| 13 |
+
"dirname": "google__vit-base-patch16-224"
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"backbone": "microsoft/swin-tiny-patch4-window7-224",
|
| 17 |
+
"subdir": "models/microsoft__swin-tiny-patch4-window7-224",
|
| 18 |
+
"dirname": "microsoft__swin-tiny-patch4-window7-224"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"backbone": "microsoft/resnet-50",
|
| 22 |
+
"subdir": "models/microsoft__resnet-50",
|
| 23 |
+
"dirname": "microsoft__resnet-50"
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"backbone": "google/efficientnet-b0",
|
| 27 |
+
"subdir": "models/google__efficientnet-b0",
|
| 28 |
+
"dirname": "google__efficientnet-b0"
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"backbone": "timm/densenet121.tv_in1k",
|
| 32 |
+
"subdir": "models/timm__densenet121.tv_in1k",
|
| 33 |
+
"dirname": "timm__densenet121.tv_in1k"
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"backbone": "torchvision/densenet121",
|
| 37 |
+
"subdir": "models/torchvision__densenet121",
|
| 38 |
+
"dirname": "torchvision__densenet121"
|
| 39 |
+
}
|
| 40 |
+
],
|
| 41 |
+
"root_code_included": true,
|
| 42 |
+
"root_code_files": [
|
| 43 |
+
"ds_proc.py",
|
| 44 |
+
"ds_model.py",
|
| 45 |
+
"ds_cfg.py"
|
| 46 |
+
],
|
| 47 |
+
"subfolder_code_included": true,
|
| 48 |
+
"subfolder_code_files": [
|
| 49 |
+
"ds_proc.py",
|
| 50 |
+
"ds_model.py",
|
| 51 |
+
"ds_cfg.py"
|
| 52 |
+
]
|
| 53 |
+
}
|
models/google__efficientnet-b0/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/google__efficientnet-b0",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/google__efficientnet-b0",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/google__efficientnet-b0/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에는 timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/google__efficientnet-b0/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/google__efficientnet-b0/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 17558436
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:865ceddfa7c4eb1c24844a2bea075ff25ee577fabb2886069b5478cd27bf6cac
|
| 3 |
size 17558436
|
models/google__vit-base-patch16-224/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/google__vit-base-patch16-224",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/google__vit-base-patch16-224",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/google__vit-base-patch16-224/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/google__vit-base-patch16-224/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/google__vit-base-patch16-224/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 346372132
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ecc8942482f4acdd221a9490549b44ab8d697b3b170cacce759bfd0e215e5df
|
| 3 |
size 346372132
|
models/microsoft__resnet-50/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/microsoft__resnet-50",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/microsoft__resnet-50",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/microsoft__resnet-50/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/microsoft__resnet-50/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/microsoft__resnet-50/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 96388660
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b46be9524952c57179580e6e9728d525676dc8d1f2cf184315277e2f57bce90
|
| 3 |
size 96388660
|
models/microsoft__swin-tiny-patch4-window7-224/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/microsoft__swin-tiny-patch4-window7-224",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/microsoft__swin-tiny-patch4-window7-224",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/microsoft__swin-tiny-patch4-window7-224/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/microsoft__swin-tiny-patch4-window7-224/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/microsoft__swin-tiny-patch4-window7-224/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 111128348
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45ec1d099f3a94683a6ded551823d596bc3cf56dd7d801adee5dc10c5b155e52
|
| 3 |
size 111128348
|
models/timm__densenet121.tv_in1k/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/timm__densenet121.tv_in1k",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/timm__densenet121.tv_in1k",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/timm__densenet121.tv_in1k/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/timm__densenet121.tv_in1k/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/timm__densenet121.tv_in1k/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 29293620
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0108b6737a776209aa44a8119bbb5034c69205bcd45dee8c5f0e3218eea1cf3a
|
| 3 |
size 29293620
|
models/torchvision__densenet121/config.json
CHANGED
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
-
"created_at": "
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/torchvision__densenet121",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
|
|
|
| 24 |
"num_labels": 3,
|
| 25 |
"transformers_version": "5.1.0",
|
| 26 |
"ds_provenance": {
|
| 27 |
+
"created_at": "20260212_202546",
|
| 28 |
"repo_id": "dsaint31/bb_mlp_224",
|
| 29 |
"subdir": "models/torchvision__densenet121",
|
| 30 |
"wrapper_class": "BackboneWithMLPHeadForImageClassification",
|
models/torchvision__densenet121/ds_model.py
CHANGED
|
@@ -94,14 +94,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
-
# PreTrainedModel은 config 객체를 받아 내부에
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
-
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
-
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
@@ -109,10 +109,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
-
# fail-fast: 학습/추론은 num_labels가 양수여야
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
-
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
@@ -120,17 +120,17 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
-
# meta는 feature 추출 및 미세조정 규칙의 단일
|
| 124 |
-
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
-
# backbone skeleton은 항상 pretrained weight 없이
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
-
# head shape은 meta의 feat_dim과 config.num_labels로
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
@@ -139,16 +139,20 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
-
# HF 초기화 훅이지만 init_weights를 override하여 head만
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
-
backbone skeleton을 건드리지 않기 위해 head만
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
-
HF 기본 init은 전체 모듈 트리를 순회할 수 있어
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
"""
|
| 153 |
if getattr(self, "classifier", None) is not None:
|
| 154 |
self.classifier.apply(self._init_weights)
|
|
@@ -160,7 +164,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 160 |
# ----------------------------
|
| 161 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 162 |
# Meta decides which loader path to use.
|
| 163 |
-
# meta가 어떤 로더 경로를 사용할지
|
| 164 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 165 |
if meta is None:
|
| 166 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
@@ -174,14 +178,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 174 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 175 |
|
| 176 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 177 |
-
# transformers 백본: config로부터 랜덤 초기화 skeleton만
|
| 178 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 179 |
return AutoModel.from_config(bb_cfg)
|
| 180 |
|
| 181 |
@staticmethod
|
| 182 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 183 |
# timm is an optional dependency and should be imported lazily.
|
| 184 |
-
# timm은 옵션 의존성이므로 지연 import
|
| 185 |
try:
|
| 186 |
import timm
|
| 187 |
except Exception as e:
|
|
@@ -190,7 +194,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 190 |
) from e
|
| 191 |
|
| 192 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 193 |
-
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0)
|
| 194 |
return timm.create_model(
|
| 195 |
f"hf_hub:{hf_repo_id}",
|
| 196 |
pretrained=False,
|
|
@@ -200,12 +204,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 200 |
@staticmethod
|
| 201 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 202 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 203 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로
|
| 204 |
if model_id != "torchvision/densenet121":
|
| 205 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 206 |
|
| 207 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 208 |
-
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None)
|
| 209 |
m = tv_models.densenet121(weights=None)
|
| 210 |
return m
|
| 211 |
|
|
@@ -222,10 +226,10 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 222 |
):
|
| 223 |
"""
|
| 224 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 225 |
-
fresh-start 전용: skeleton backbone에 pretrained 가중치를
|
| 226 |
|
| 227 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 228 |
-
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로
|
| 229 |
"""
|
| 230 |
bb = self.config.backbone_name_or_path
|
| 231 |
meta = self._meta
|
|
@@ -240,7 +244,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 240 |
return
|
| 241 |
|
| 242 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 243 |
-
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를
|
| 244 |
ref = AutoModel.from_pretrained(
|
| 245 |
bb,
|
| 246 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
@@ -248,18 +252,18 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 248 |
)
|
| 249 |
|
| 250 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 251 |
-
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해
|
| 252 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 253 |
del ref
|
| 254 |
|
| 255 |
@torch.no_grad()
|
| 256 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 257 |
# timm must be present for timm backbones.
|
| 258 |
-
# timm
|
| 259 |
import timm
|
| 260 |
|
| 261 |
# Create a pretrained reference model and copy its weights strictly.
|
| 262 |
-
# pretrained reference 모델을 만들고 가중치를 strict하게
|
| 263 |
ref = timm.create_model(
|
| 264 |
f"hf_hub:{hf_repo_id}",
|
| 265 |
pretrained=True,
|
|
@@ -272,12 +276,12 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 272 |
@torch.no_grad()
|
| 273 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 274 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 275 |
-
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만
|
| 276 |
if model_id != "torchvision/densenet121":
|
| 277 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 278 |
|
| 279 |
# Use torchvision's default pretrained weights for densenet121.
|
| 280 |
-
# torchvision의 densenet121 기본 pretrained weights를
|
| 281 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 282 |
|
| 283 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
@@ -290,7 +294,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 290 |
@staticmethod
|
| 291 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 292 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 293 |
-
# 일부 transformers vision CNN은 pooler_output을 명시적으로
|
| 294 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 295 |
x = outputs.pooler_output
|
| 296 |
if x.dim() == 2:
|
|
@@ -300,7 +304,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 300 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 301 |
|
| 302 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 303 |
-
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을
|
| 304 |
x = outputs.last_hidden_state
|
| 305 |
if x.dim() == 4:
|
| 306 |
return x.mean(dim=(2, 3))
|
|
@@ -312,29 +316,29 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 312 |
|
| 313 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 314 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 315 |
-
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간
|
| 316 |
rule = self._meta["feat_rule"]
|
| 317 |
|
| 318 |
if rule == "cls":
|
| 319 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 320 |
-
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을
|
| 321 |
return outputs.last_hidden_state[:, 0, :]
|
| 322 |
|
| 323 |
if rule == "pool_or_mean":
|
| 324 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 325 |
-
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을
|
| 326 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 327 |
return outputs.pooler_output
|
| 328 |
return outputs.last_hidden_state.mean(dim=1)
|
| 329 |
|
| 330 |
if rule == "pool_or_gap":
|
| 331 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 332 |
-
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을
|
| 333 |
return self._pool_or_gap(outputs)
|
| 334 |
|
| 335 |
if rule == "timm_gap":
|
| 336 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 337 |
-
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로
|
| 338 |
if not isinstance(outputs, torch.Tensor):
|
| 339 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 340 |
if outputs.dim() != 4:
|
|
@@ -343,7 +347,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 343 |
|
| 344 |
if rule == "torchvision_densenet_gap":
|
| 345 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 346 |
-
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이
|
| 347 |
if not isinstance(outputs, torch.Tensor):
|
| 348 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 349 |
if outputs.dim() != 4:
|
|
@@ -362,7 +366,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 362 |
**kwargs,
|
| 363 |
):
|
| 364 |
# Type decides the backbone forward path and output format.
|
| 365 |
-
# type이 backbone forward 경로 및 출력 포맷을
|
| 366 |
t = self._meta["type"]
|
| 367 |
|
| 368 |
if t == "timm_densenet":
|
|
@@ -394,7 +398,7 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 394 |
|
| 395 |
else:
|
| 396 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 397 |
-
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을
|
| 398 |
outputs = self.backbone(
|
| 399 |
pixel_values=pixel_values,
|
| 400 |
output_attentions=output_attentions,
|
|
@@ -407,13 +411,13 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 407 |
attentions = getattr(outputs, "attentions", None)
|
| 408 |
|
| 409 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 410 |
-
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를
|
| 411 |
logits = self.classifier(feats)
|
| 412 |
|
| 413 |
loss = None
|
| 414 |
if labels is not None:
|
| 415 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 416 |
-
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길
|
| 417 |
loss = F.cross_entropy(logits, labels)
|
| 418 |
|
| 419 |
if not return_dict:
|
|
@@ -434,14 +438,14 @@ class BackboneWithMLPHeadForImageClassification(PreTrainedModel):
|
|
| 434 |
# ============================================================
|
| 435 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 436 |
# Toggle requires_grad for all parameters in a module.
|
| 437 |
-
# 모듈의 모든 파라미터에 대해 requires_grad를
|
| 438 |
for p in module.parameters():
|
| 439 |
p.requires_grad = flag
|
| 440 |
|
| 441 |
|
| 442 |
def set_bn_eval(module: nn.Module):
|
| 443 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 444 |
-
# BatchNorm 레이어를 eval 모드로 두어 running stats를
|
| 445 |
for m in module.modules():
|
| 446 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 447 |
m.eval()
|
|
@@ -449,7 +453,7 @@ def set_bn_eval(module: nn.Module):
|
|
| 449 |
|
| 450 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 451 |
# Stage1: freeze backbone and train only the head.
|
| 452 |
-
# stage1: backbone을 freeze하고 head만
|
| 453 |
_set_requires_grad(model.backbone, False)
|
| 454 |
_set_requires_grad(model.classifier, True)
|
| 455 |
|
|
@@ -460,7 +464,7 @@ def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn:
|
|
| 460 |
|
| 461 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 462 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 463 |
-
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수
|
| 464 |
model.train()
|
| 465 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 466 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
@@ -469,7 +473,7 @@ def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_b
|
|
| 469 |
|
| 470 |
def trainable_summary(model: nn.Module):
|
| 471 |
# Print a compact summary of trainable parameters.
|
| 472 |
-
# 학습 가능 파라미터 요약을 간단히
|
| 473 |
total = sum(p.numel() for p in model.parameters())
|
| 474 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 475 |
ratio = trainable / total if total > 0 else 0.0
|
|
@@ -483,7 +487,7 @@ def unfreeze_last_stage(
|
|
| 483 |
keep_bn_eval: bool = True,
|
| 484 |
):
|
| 485 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 486 |
-
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을
|
| 487 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 488 |
|
| 489 |
n = int(last_n)
|
|
@@ -498,7 +502,7 @@ def unfreeze_last_stage(
|
|
| 498 |
|
| 499 |
if bb_type == "vit":
|
| 500 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 501 |
-
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에
|
| 502 |
blocks = list(model.backbone.encoder.layer)
|
| 503 |
for blk in blocks[-n:]:
|
| 504 |
_set_requires_grad(blk, True)
|
|
@@ -506,7 +510,7 @@ def unfreeze_last_stage(
|
|
| 506 |
|
| 507 |
if bb_type == "swin":
|
| 508 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 509 |
-
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze
|
| 510 |
stages = list(model.backbone.encoder.layers)
|
| 511 |
blocks: List[nn.Module] = []
|
| 512 |
for st in stages:
|
|
@@ -517,7 +521,7 @@ def unfreeze_last_stage(
|
|
| 517 |
|
| 518 |
if bb_type == "resnet":
|
| 519 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 520 |
-
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze
|
| 521 |
bb = model.backbone
|
| 522 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 523 |
if not hasattr(bb, name):
|
|
@@ -538,7 +542,7 @@ def unfreeze_last_stage(
|
|
| 538 |
|
| 539 |
if bb_type == "efficientnet":
|
| 540 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 541 |
-
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze
|
| 542 |
bb = model.backbone
|
| 543 |
if not hasattr(bb, "features"):
|
| 544 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
@@ -556,7 +560,7 @@ def unfreeze_last_stage(
|
|
| 556 |
|
| 557 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 558 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 559 |
-
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze
|
| 560 |
bb = model.backbone
|
| 561 |
if not hasattr(bb, "features"):
|
| 562 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
@@ -575,7 +579,7 @@ def unfreeze_last_stage(
|
|
| 575 |
|
| 576 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 577 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 578 |
-
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를
|
| 579 |
return list(db.children())
|
| 580 |
|
| 581 |
blocks: List[nn.Module] = []
|
|
@@ -600,5 +604,5 @@ def unfreeze_last_stage(
|
|
| 600 |
# register
|
| 601 |
# -------------------------
|
| 602 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 603 |
-
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에
|
| 604 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
|
|
|
| 94 |
|
| 95 |
def __init__(self, config: BackboneMLPHeadConfig):
|
| 96 |
# PreTrainedModel expects a config object and stores it internally.
|
| 97 |
+
# PreTrainedModel은 config 객체를 받아 내부에 저장함.
|
| 98 |
super().__init__(config)
|
| 99 |
|
| 100 |
# Fail-fast: the model is not meant to be instantiated without a valid backbone id.
|
| 101 |
+
# fail-fast: 유효한 backbone id 없이 모델을 만드는 사용 시나리오는 허용하지 않음 - fast fail.
|
| 102 |
#
|
| 103 |
# Note: Transformers may create configs with no args, but models are conventionally created with configs.
|
| 104 |
+
# 참고: Transformers는 config 무인자 생성이 있을 수 있으나, 모델은 관례적으로 config를 받아 생성.
|
| 105 |
if config.backbone_name_or_path is None:
|
| 106 |
raise ValueError(
|
| 107 |
"config.backbone_name_or_path is None. "
|
|
|
|
| 109 |
)
|
| 110 |
|
| 111 |
# Fail-fast: training/inference requires a positive number of labels.
|
| 112 |
+
# fail-fast: 학습/추론은 num_labels가 양수여야 함.
|
| 113 |
#
|
| 114 |
# Config may exist in a minimal form for internal serialization paths, but the model should not.
|
| 115 |
+
# config는 내부 직렬화 경로에서 최소 형태로 존재할 수 있으나 모델은 해당 없음.
|
| 116 |
if int(getattr(config, "num_labels", 0)) <= 0:
|
| 117 |
raise ValueError(
|
| 118 |
f"config.num_labels must be > 0, got {getattr(config, 'num_labels', None)}. "
|
|
|
|
| 120 |
)
|
| 121 |
|
| 122 |
# Meta is a single source of truth for extraction and fine-tuning rules.
|
| 123 |
+
# meta는 feature 추출 및 미세조정 규칙의 단일 기준.
|
| 124 |
+
# Resolve backbone meta from config (preferred) or fallback table (for backward compatibility).
|
| 125 |
# Prefer config.backbone_meta to keep Hub runtime self-contained.
|
| 126 |
self._meta = _resolve_backbone_meta(config, fallback_table=BACKBONE_META)
|
| 127 |
|
| 128 |
# Backbone skeleton is always created without pretrained weights.
|
| 129 |
+
# backbone skeleton은 항상 pretrained weight 없이 생성.
|
| 130 |
self.backbone = self._build_backbone_skeleton(config.backbone_name_or_path)
|
| 131 |
|
| 132 |
# Head shape is driven by meta feat_dim and config.num_labels.
|
| 133 |
+
# head shape은 meta의 feat_dim과 config.num_labels로 결정.
|
| 134 |
self.classifier = MLPHead(
|
| 135 |
in_dim=int(self._meta["feat_dim"]),
|
| 136 |
num_labels=int(config.num_labels),
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
# HF initialization hook, but we override init_weights to initialize head-only.
|
| 142 |
+
# HF 초기화 훅이지만 init_weights를 override하여 head만 초기화하도록 변경.
|
| 143 |
self.post_init()
|
| 144 |
|
| 145 |
def init_weights(self):
|
| 146 |
"""
|
| 147 |
Initialize only the head to avoid touching the backbone skeleton.
|
| 148 |
+
backbone skeleton을 건드리지 않기 위해 head만 초기화.
|
| 149 |
|
| 150 |
HF's default init may traverse the entire module tree, which is undesirable here.
|
| 151 |
+
HF 기본 init은 전체 모듈 트리를 순회할 수 있어 여기서 그대로 사용하기 부적절.
|
| 152 |
+
|
| 153 |
+
초기 설계에서 __init__ 내부에서 backbone의 가중치 로드를 수행함(편리를 위해).
|
| 154 |
+
이 경우, HF의 post_init()으로 인해 해당 로드가 취소되는 경우가 존재(timm, torchvision 등의 백본).
|
| 155 |
+
때문에 이를 오버라이드 하여 classifier만 초기화 하도록 변경함.
|
| 156 |
"""
|
| 157 |
if getattr(self, "classifier", None) is not None:
|
| 158 |
self.classifier.apply(self._init_weights)
|
|
|
|
| 164 |
# ----------------------------
|
| 165 |
def _build_backbone_skeleton(self, backbone_id: str) -> nn.Module:
|
| 166 |
# Meta decides which loader path to use.
|
| 167 |
+
# meta가 어떤 로더 경로를 사용할지 결정.
|
| 168 |
meta = self._meta if backbone_id == self.config.backbone_name_or_path else BACKBONE_META.get(backbone_id)
|
| 169 |
if meta is None:
|
| 170 |
raise KeyError(f"Unknown backbone_id={backbone_id}. Provide backbone_meta in config or extend BACKBONE_META.")
|
|
|
|
| 178 |
return self._build_torchvision_densenet_skeleton(backbone_id)
|
| 179 |
|
| 180 |
# For transformers backbones: build a random-weight skeleton from config only.
|
| 181 |
+
# transformers 백본: config로부터 랜덤 초기화 skeleton만 생성.
|
| 182 |
bb_cfg = AutoConfig.from_pretrained(backbone_id)
|
| 183 |
return AutoModel.from_config(bb_cfg)
|
| 184 |
|
| 185 |
@staticmethod
|
| 186 |
def _build_timm_densenet_skeleton(hf_repo_id: str) -> nn.Module:
|
| 187 |
# timm is an optional dependency and should be imported lazily.
|
| 188 |
+
# timm은 옵션 의존성이므로 지연 import 수행.
|
| 189 |
try:
|
| 190 |
import timm
|
| 191 |
except Exception as e:
|
|
|
|
| 194 |
) from e
|
| 195 |
|
| 196 |
# Build structure only (pretrained=False) and remove classifier head (num_classes=0).
|
| 197 |
+
# 구조만 생성(pretrained=False)하고 분류기 head는 제거(num_classes=0).
|
| 198 |
return timm.create_model(
|
| 199 |
f"hf_hub:{hf_repo_id}",
|
| 200 |
pretrained=False,
|
|
|
|
| 204 |
@staticmethod
|
| 205 |
def _build_torchvision_densenet_skeleton(model_id: str) -> nn.Module:
|
| 206 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 207 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 의도적으로 지원.
|
| 208 |
if model_id != "torchvision/densenet121":
|
| 209 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 210 |
|
| 211 |
# Build structure only (weights=None) to avoid implicit pretrained loading.
|
| 212 |
+
# implicit pretrained 로드를 피하기 위해 구조만 생성(weights=None).
|
| 213 |
m = tv_models.densenet121(weights=None)
|
| 214 |
return m
|
| 215 |
|
|
|
|
| 226 |
):
|
| 227 |
"""
|
| 228 |
Fresh-start only: inject pretrained backbone weights into the skeleton.
|
| 229 |
+
fresh-start 전용: skeleton backbone에 pretrained 가중치를 주입.
|
| 230 |
|
| 231 |
Do NOT call this after from_pretrained() because it would overwrite checkpoint weights.
|
| 232 |
+
from_pretrained() 이후 호출하면 체크포인트 가중치를 덮어쓰므로 주의할 것.
|
| 233 |
"""
|
| 234 |
bb = self.config.backbone_name_or_path
|
| 235 |
meta = self._meta
|
|
|
|
| 244 |
return
|
| 245 |
|
| 246 |
# For transformers backbones, load a reference pretrained model and copy weights into our skeleton.
|
| 247 |
+
# transformers 백본은 reference pretrained 모델을 로드한 뒤 skeleton에 가중치를 복사.
|
| 248 |
ref = AutoModel.from_pretrained(
|
| 249 |
bb,
|
| 250 |
low_cpu_mem_usage=low_cpu_mem_usage,
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
# strict=False is used to tolerate harmless key differences across minor versions.
|
| 255 |
+
# strict=False는 마이너 버전 차이로 인한 무해한 키 차이를 허용하기 위해 사용.
|
| 256 |
self.backbone.load_state_dict(ref.state_dict(), strict=False)
|
| 257 |
del ref
|
| 258 |
|
| 259 |
@torch.no_grad()
|
| 260 |
def _load_timm_pretrained_into_skeleton_(self, hf_repo_id: str):
|
| 261 |
# timm must be present for timm backbones.
|
| 262 |
+
# timm 백본에��� timm 설치가 필요.
|
| 263 |
import timm
|
| 264 |
|
| 265 |
# Create a pretrained reference model and copy its weights strictly.
|
| 266 |
+
# pretrained reference 모델을 만들고 가중치를 strict하게 복사.
|
| 267 |
ref = timm.create_model(
|
| 268 |
f"hf_hub:{hf_repo_id}",
|
| 269 |
pretrained=True,
|
|
|
|
| 276 |
@torch.no_grad()
|
| 277 |
def _load_torchvision_pretrained_into_skeleton_(self, model_id: str):
|
| 278 |
# This project intentionally supports only torchvision/densenet121 in the 224 whitelist.
|
| 279 |
+
# 이 프로젝트는 224 화이트리스트에서 torchvision/densenet121만 지원.
|
| 280 |
if model_id != "torchvision/densenet121":
|
| 281 |
raise ValueError(f"Unsupported torchvision DenseNet id (224 whitelist only): {model_id}")
|
| 282 |
|
| 283 |
# Use torchvision's default pretrained weights for densenet121.
|
| 284 |
+
# torchvision의 densenet121 기본 pretrained weights를 사용.
|
| 285 |
ref = tv_models.densenet121(weights=tv_models.DenseNet121_Weights.DEFAULT).eval()
|
| 286 |
|
| 287 |
self.backbone.load_state_dict(ref.state_dict(), strict=True)
|
|
|
|
| 294 |
@staticmethod
|
| 295 |
def _pool_or_gap(outputs) -> torch.Tensor:
|
| 296 |
# Some transformers vision CNNs provide pooler_output explicitly.
|
| 297 |
+
# 일부 transformers vision CNN은 pooler_output을 명시적으로 제공.
|
| 298 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 299 |
x = outputs.pooler_output
|
| 300 |
if x.dim() == 2:
|
|
|
|
| 304 |
raise RuntimeError(f"Unexpected pooler_output shape: {tuple(x.shape)}")
|
| 305 |
|
| 306 |
# Otherwise we expect a CNN-style last_hidden_state=(B,C,H,W) and apply GAP.
|
| 307 |
+
# 그렇지 않으면 CNN 스타일 last_hidden_state=(B,C,H,W)를 기대하고 GAP을 적용.
|
| 308 |
x = outputs.last_hidden_state
|
| 309 |
if x.dim() == 4:
|
| 310 |
return x.mean(dim=(2, 3))
|
|
|
|
| 316 |
|
| 317 |
def _extract_features(self, outputs, pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor:
|
| 318 |
# Feature rule is defined by BACKBONE_META and must remain stable across saves/loads.
|
| 319 |
+
# feature 규칙은 BACKBONE_META로 정의되며 저장/로드 간 안정적 동작을 위해 제한된 모델만 사용.
|
| 320 |
rule = self._meta["feat_rule"]
|
| 321 |
|
| 322 |
if rule == "cls":
|
| 323 |
# ViT-style: use CLS token embedding from last_hidden_state.
|
| 324 |
+
# ViT 스타일: last_hidden_state에서 CLS 토큰 임베딩을 사용.
|
| 325 |
return outputs.last_hidden_state[:, 0, :]
|
| 326 |
|
| 327 |
if rule == "pool_or_mean":
|
| 328 |
# Swin-style: prefer pooler_output if present, else mean-pool over tokens.
|
| 329 |
+
# Swin 스타일: pooler_output이 있으면 우선 사용하고, 없으면 토큰 평균 풀링을 사용.
|
| 330 |
if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
|
| 331 |
return outputs.pooler_output
|
| 332 |
return outputs.last_hidden_state.mean(dim=1)
|
| 333 |
|
| 334 |
if rule == "pool_or_gap":
|
| 335 |
# CNN-style: use pooler_output if present, else GAP over spatial dims.
|
| 336 |
+
# CNN 스타일: pooler_output이 있으면 사용하고, 없으면 공간 차원 GAP을 사용.
|
| 337 |
return self._pool_or_gap(outputs)
|
| 338 |
|
| 339 |
if rule == "timm_gap":
|
| 340 |
# timm forward_features returns a feature map (B,C,H,W) which we GAP to (B,C).
|
| 341 |
+
# timm forward_features는 (B,C,H,W) feature map을 반환하며 이를 GAP으로 (B,C)로 변환.
|
| 342 |
if not isinstance(outputs, torch.Tensor):
|
| 343 |
raise TypeError(f"timm_gap expects Tensor features, got {type(outputs)}")
|
| 344 |
if outputs.dim() != 4:
|
|
|
|
| 347 |
|
| 348 |
if rule == "torchvision_densenet_gap":
|
| 349 |
# torchvision DenseNet features are feature maps (B,C,H,W) and require GAP.
|
| 350 |
+
# torchvision DenseNet features는 (B,C,H,W) feature map이며 GAP이 필요.
|
| 351 |
if not isinstance(outputs, torch.Tensor):
|
| 352 |
raise TypeError(f"torchvision_densenet_gap expects Tensor, got {type(outputs)}")
|
| 353 |
if outputs.dim() != 4:
|
|
|
|
| 366 |
**kwargs,
|
| 367 |
):
|
| 368 |
# Type decides the backbone forward path and output format.
|
| 369 |
+
# type이 backbone forward 경로 및 출력 포맷을 결정.
|
| 370 |
t = self._meta["type"]
|
| 371 |
|
| 372 |
if t == "timm_densenet":
|
|
|
|
| 398 |
|
| 399 |
else:
|
| 400 |
# Transformers vision models are called with pixel_values and return ModelOutput.
|
| 401 |
+
# transformers vision 모델은 pixel_values로 호출되며 ModelOutput을 반환.
|
| 402 |
outputs = self.backbone(
|
| 403 |
pixel_values=pixel_values,
|
| 404 |
output_attentions=output_attentions,
|
|
|
|
| 411 |
attentions = getattr(outputs, "attentions", None)
|
| 412 |
|
| 413 |
# Classifier consumes (B, feat_dim) and returns logits (B, num_labels).
|
| 414 |
+
# classifier는 (B, feat_dim)을 받아 logits (B, num_labels)를 반환.
|
| 415 |
logits = self.classifier(feats)
|
| 416 |
|
| 417 |
loss = None
|
| 418 |
if labels is not None:
|
| 419 |
# Cross entropy expects labels as class indices in [0, num_labels).
|
| 420 |
+
# cross entropy는 labels가 [0, num_labels) 범위의 class index이길 기대함.
|
| 421 |
loss = F.cross_entropy(logits, labels)
|
| 422 |
|
| 423 |
if not return_dict:
|
|
|
|
| 438 |
# ============================================================
|
| 439 |
def _set_requires_grad(module: nn.Module, flag: bool):
|
| 440 |
# Toggle requires_grad for all parameters in a module.
|
| 441 |
+
# 모듈의 모든 파라미터에 대해 requires_grad를 토글.
|
| 442 |
for p in module.parameters():
|
| 443 |
p.requires_grad = flag
|
| 444 |
|
| 445 |
|
| 446 |
def set_bn_eval(module: nn.Module):
|
| 447 |
# Put BatchNorm layers into eval mode to freeze running stats.
|
| 448 |
+
# BatchNorm 레이어를 eval 모드로 두어 running stats를 고정.
|
| 449 |
for m in module.modules():
|
| 450 |
if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)):
|
| 451 |
m.eval()
|
|
|
|
| 453 |
|
| 454 |
def freeze_backbone(model: BackboneWithMLPHeadForImageClassification, freeze_bn: bool = True):
|
| 455 |
# Stage1: freeze backbone and train only the head.
|
| 456 |
+
# stage1: backbone을 freeze하고 head만 학습.
|
| 457 |
_set_requires_grad(model.backbone, False)
|
| 458 |
_set_requires_grad(model.classifier, True)
|
| 459 |
|
|
|
|
| 464 |
|
| 465 |
def finetune_train_mode(model: BackboneWithMLPHeadForImageClassification, keep_bn_eval: bool = True):
|
| 466 |
# Stage2: train mode, optionally keeping BN layers in eval for stability.
|
| 467 |
+
# stage2: train 모드로 두되 안정성을 위해 BN을 eval로 유지할 수 있음. (buffer 등을 유지하기 위해)
|
| 468 |
model.train()
|
| 469 |
meta = getattr(model, "_meta", None) or getattr(model.config, "backbone_meta", None)
|
| 470 |
if keep_bn_eval and meta.get("has_bn", False):
|
|
|
|
| 473 |
|
| 474 |
def trainable_summary(model: nn.Module):
|
| 475 |
# Print a compact summary of trainable parameters.
|
| 476 |
+
# 학습 가능 파라미터 요약을 간단히 출력.
|
| 477 |
total = sum(p.numel() for p in model.parameters())
|
| 478 |
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 479 |
ratio = trainable / total if total > 0 else 0.0
|
|
|
|
| 487 |
keep_bn_eval: bool = True,
|
| 488 |
):
|
| 489 |
# This utility implements BACKBONE_META['unfreeze']=="last_n" across supported backbones.
|
| 490 |
+
# 이 유틸은 지원 백본들에 대해 BACKBONE_META['unfreeze']=="last_n"을 구현.
|
| 491 |
freeze_backbone(model, freeze_bn=keep_bn_eval)
|
| 492 |
|
| 493 |
n = int(last_n)
|
|
|
|
| 502 |
|
| 503 |
if bb_type == "vit":
|
| 504 |
# ViT blocks live under backbone.encoder.layer in the transformers implementation.
|
| 505 |
+
# ViT 블록은 transformers 구현에서 backbone.encoder.layer 아래에 존재함.
|
| 506 |
blocks = list(model.backbone.encoder.layer)
|
| 507 |
for blk in blocks[-n:]:
|
| 508 |
_set_requires_grad(blk, True)
|
|
|
|
| 510 |
|
| 511 |
if bb_type == "swin":
|
| 512 |
# Swin blocks are nested by stages and blocks; we flatten and unfreeze last n blocks.
|
| 513 |
+
# Swin 블록은 stage와 block으로 중첩되어 있어 펼친 후 마지막 n개를 unfreeze.
|
| 514 |
stages = list(model.backbone.encoder.layers)
|
| 515 |
blocks: List[nn.Module] = []
|
| 516 |
for st in stages:
|
|
|
|
| 521 |
|
| 522 |
if bb_type == "resnet":
|
| 523 |
# ResNet uses layer1..layer4 stages; we unfreeze at block granularity.
|
| 524 |
+
# ResNet은 layer1..layer4 stage를 사용하며 block 단위로 unfreeze.
|
| 525 |
bb = model.backbone
|
| 526 |
for name in ("layer1", "layer2", "layer3", "layer4"):
|
| 527 |
if not hasattr(bb, name):
|
|
|
|
| 542 |
|
| 543 |
if bb_type == "efficientnet":
|
| 544 |
# EfficientNet in transformers exposes features; we unfreeze from the tail blocks.
|
| 545 |
+
# transformers EfficientNet은 features를 노출하며 뒤쪽 블록부터 unfreeze.
|
| 546 |
bb = model.backbone
|
| 547 |
if not hasattr(bb, "features"):
|
| 548 |
raise RuntimeError("Unexpected EfficientNet structure: missing features")
|
|
|
|
| 560 |
|
| 561 |
if bb_type in ("timm_densenet", "torchvision_densenet"):
|
| 562 |
# DenseNet exposes a .features module with named blocks; we unfreeze last n submodules.
|
| 563 |
+
# DenseNet은 .features 모듈에 블록들이 이름으로 존재하며 마지막 n개 서브모듈을 unfreeze.
|
| 564 |
bb = model.backbone
|
| 565 |
if not hasattr(bb, "features"):
|
| 566 |
raise RuntimeError("Unexpected DenseNet: missing features")
|
|
|
|
| 579 |
|
| 580 |
def _denselayers(db: nn.Module) -> List[nn.Module]:
|
| 581 |
# Dense blocks contain multiple DenseLayer children; we return them for fine-grained unfreezing.
|
| 582 |
+
# denseblock은 DenseLayer 자식들을 가지므로 세밀한 unfreeze를 위해 이를 반환.
|
| 583 |
return list(db.children())
|
| 584 |
|
| 585 |
blocks: List[nn.Module] = []
|
|
|
|
| 604 |
# register
|
| 605 |
# -------------------------
|
| 606 |
# Register for AutoModelForImageClassification so from_pretrained can resolve this custom class.
|
| 607 |
+
# from_pretrained가 이 커스텀 클래스를 해석할 수 있도록 AutoModelForImageClassification에 등록.
|
| 608 |
BackboneWithMLPHeadForImageClassification.register_for_auto_class("AutoModelForImageClassification")
|
models/torchvision__densenet121/ds_proc.py
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
-
#
|
| 8 |
-
#
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
@@ -27,41 +27,38 @@ except ImportError:
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
-
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
-
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야
|
| 37 |
-
|
| 38 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 39 |
-
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안
|
| 40 |
-
|
| 41 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 42 |
-
런타임 객체는 backbone meta에 따라 init/load 시점에
|
| 43 |
-
|
| 44 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 45 |
-
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야
|
| 46 |
"""
|
| 47 |
|
| 48 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 49 |
-
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를
|
| 50 |
model_input_names = ["pixel_values"]
|
| 51 |
|
| 52 |
def __init__(
|
| 53 |
self,
|
| 54 |
backbone_name_or_path: BackboneID,
|
| 55 |
-
is_training: bool = False,
|
| 56 |
use_fast: bool = False,
|
| 57 |
**kwargs,
|
| 58 |
):
|
| 59 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 60 |
-
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를
|
| 61 |
super().__init__(**kwargs)
|
| 62 |
|
| 63 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 64 |
-
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를
|
| 65 |
if backbone_name_or_path not in BACKBONE_META:
|
| 66 |
raise ValueError(
|
| 67 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
@@ -69,23 +66,23 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 72 |
-
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야
|
| 73 |
self.backbone_name_or_path = backbone_name_or_path
|
| 74 |
self.is_training = bool(is_training)
|
| 75 |
|
| 76 |
# Reproducibility switch for transformers processors.
|
| 77 |
-
# transformers processor의 fast/slow 선택을 재현 가능하게
|
| 78 |
self.use_fast = bool(use_fast)
|
| 79 |
|
| 80 |
# Runtime-only fields: must never be serialized.
|
| 81 |
-
# 런타임 전용 필드: 절대 직렬화되면 안
|
| 82 |
self._meta = None
|
| 83 |
-
self._delegate
|
| 84 |
-
self._timm_transform
|
| 85 |
self._torchvision_transform = None
|
| 86 |
|
| 87 |
# Build runtime objects according to backbone type.
|
| 88 |
-
# backbone type에 따라 런타임 객체를
|
| 89 |
self._build_runtime()
|
| 90 |
|
| 91 |
# ============================================================
|
|
@@ -95,13 +92,13 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 95 |
def _build_runtime(self):
|
| 96 |
"""
|
| 97 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 98 |
-
BACKBONE_META["type"]에 따라 런타임 delegate/transform을
|
| 99 |
"""
|
| 100 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 101 |
self._meta = meta
|
| 102 |
|
| 103 |
# Always reset runtime fields before rebuilding.
|
| 104 |
-
# 재구성 전 런타임 필드는 항상
|
| 105 |
self._delegate = None
|
| 106 |
self._timm_transform = None
|
| 107 |
self._torchvision_transform = None
|
|
@@ -110,7 +107,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 110 |
|
| 111 |
if t == "timm_densenet":
|
| 112 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 113 |
-
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을
|
| 114 |
self._timm_transform = self._build_timm_transform(
|
| 115 |
backbone_id=self.backbone_name_or_path,
|
| 116 |
is_training=self.is_training,
|
|
@@ -119,17 +116,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 119 |
|
| 120 |
if t == "torchvision_densenet":
|
| 121 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 122 |
-
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가
|
| 123 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 124 |
is_training=self.is_training
|
| 125 |
)
|
| 126 |
return
|
| 127 |
|
| 128 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 129 |
-
# 기본: transformers 백본은 공식 AutoImageProcessor에
|
| 130 |
#
|
| 131 |
# IMPORTANT:
|
| 132 |
-
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로
|
| 133 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 134 |
self.backbone_name_or_path,
|
| 135 |
use_fast=self.use_fast,
|
|
@@ -140,7 +137,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 140 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 141 |
"""
|
| 142 |
Create timm transform without storing non-serializable objects in config.
|
| 143 |
-
비직렬화 객체를 config에 저장하지 않고 timm transform을
|
| 144 |
"""
|
| 145 |
try:
|
| 146 |
import timm
|
|
@@ -151,20 +148,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 151 |
) from e
|
| 152 |
|
| 153 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 154 |
-
# data config 추출만 필요하므로 pretrained=False를 우선
|
| 155 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 156 |
dc = resolve_model_data_config(m)
|
| 157 |
|
| 158 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 159 |
-
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을
|
| 160 |
-
tfm = create_transform(**dc, is_training=is_training)
|
| 161 |
return tfm
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 165 |
"""
|
| 166 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 167 |
-
DenseNet-121용 torchvision 전처리(224 파이프라인)를
|
| 168 |
"""
|
| 169 |
try:
|
| 170 |
from torchvision import transforms
|
|
@@ -174,28 +171,29 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 174 |
) from e
|
| 175 |
|
| 176 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 177 |
-
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화
|
| 178 |
mean = (0.485, 0.456, 0.406)
|
| 179 |
-
std
|
| 180 |
|
| 181 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 182 |
-
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을
|
| 183 |
if is_training:
|
| 184 |
return transforms.Compose(
|
| 185 |
[
|
| 186 |
-
transforms.RandomResizedCrop(224),
|
| 187 |
-
transforms.RandomHorizontalFlip(p=0.5),
|
|
|
|
| 188 |
transforms.ToTensor(),
|
| 189 |
transforms.Normalize(mean=mean, std=std),
|
| 190 |
]
|
| 191 |
)
|
| 192 |
|
| 193 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 194 |
-
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를
|
| 195 |
return transforms.Compose(
|
| 196 |
[
|
| 197 |
transforms.Resize(256),
|
| 198 |
-
transforms.CenterCrop(224),
|
| 199 |
transforms.ToTensor(),
|
| 200 |
transforms.Normalize(mean=mean, std=std),
|
| 201 |
]
|
|
@@ -208,24 +206,24 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 208 |
def to_dict(self) -> dict[str, Any]:
|
| 209 |
"""
|
| 210 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 211 |
-
preprocessor_config.json에 들어갈 JSON 직렬화 dict를
|
| 212 |
|
| 213 |
Important: do not leak runtime objects into the serialized dict.
|
| 214 |
-
중요: 런타임 객체가 직렬화 dict에 섞이면 안
|
| 215 |
"""
|
| 216 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 217 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 218 |
d = super().to_dict()
|
| 219 |
|
| 220 |
# Force minimal stable fields for long-term compatibility.
|
| 221 |
-
# 장기 호환을 위해 최소 안정 필드를
|
| 222 |
-
d["image_processor_type"]
|
| 223 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 224 |
d["is_training"] = self.is_training
|
| 225 |
-
d["use_fast"]
|
| 226 |
|
| 227 |
# Remove any runtime-only fields defensively.
|
| 228 |
-
# 런타임 전용 필드는 보수적으로
|
| 229 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 230 |
d.pop(key, None)
|
| 231 |
|
|
@@ -235,14 +233,14 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 235 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 236 |
"""
|
| 237 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 238 |
-
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드
|
| 239 |
"""
|
| 240 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 241 |
if backbone is None:
|
| 242 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 243 |
|
| 244 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 245 |
-
use_fast
|
| 246 |
|
| 247 |
return cls(
|
| 248 |
backbone_name_or_path=backbone,
|
|
@@ -255,20 +253,20 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 255 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 256 |
"""
|
| 257 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 258 |
-
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한
|
| 259 |
|
| 260 |
Strategy:
|
| 261 |
전략:
|
| 262 |
|
| 263 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 264 |
-
AutoConfig로 config.json을 읽고 backbone_name_or_path를
|
| 265 |
"""
|
| 266 |
|
| 267 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 268 |
-
# is_training은 런타임 전용이며 추론/서빙 기본값은 False
|
| 269 |
#
|
| 270 |
# IMPORTANT:
|
| 271 |
-
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면
|
| 272 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 273 |
|
| 274 |
kwargs.pop("trust_remote_code", None)
|
|
@@ -289,7 +287,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 289 |
@staticmethod
|
| 290 |
def _ensure_list(images: Any) -> list[Any]:
|
| 291 |
# Normalize scalar image input to a list for uniform processing.
|
| 292 |
-
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를
|
| 293 |
if isinstance(images, (list, tuple)):
|
| 294 |
return list(images)
|
| 295 |
return [images]
|
|
@@ -297,7 +295,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 297 |
@staticmethod
|
| 298 |
def _to_pil_rgb(x: Any):
|
| 299 |
# Convert common image inputs into PIL RGB images.
|
| 300 |
-
# 일반적인 입력을 PIL RGB 이미지로
|
| 301 |
from PIL import Image as PILImage
|
| 302 |
|
| 303 |
if isinstance(x, PILImage.Image):
|
|
@@ -314,17 +312,17 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 314 |
) -> dict[str, Any]:
|
| 315 |
"""
|
| 316 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 317 |
-
이미지를 {"pixel_values": Tensor/ndarray}로
|
| 318 |
"""
|
| 319 |
images = self._ensure_list(images)
|
| 320 |
|
| 321 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 322 |
-
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로
|
| 323 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 324 |
self._build_runtime()
|
| 325 |
|
| 326 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 327 |
-
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 328 |
if self._timm_transform is not None:
|
| 329 |
pv: list[torch.Tensor] = []
|
| 330 |
for im in images:
|
|
@@ -337,7 +335,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 337 |
return self._format_return(pixel_values, return_tensors)
|
| 338 |
|
| 339 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 340 |
-
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32
|
| 341 |
if self._torchvision_transform is not None:
|
| 342 |
pv: list[torch.Tensor] = []
|
| 343 |
for im in images:
|
|
@@ -350,7 +348,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 350 |
return self._format_return(pixel_values, return_tensors)
|
| 351 |
|
| 352 |
# transformers delegate path: rely on official processor behavior.
|
| 353 |
-
# transformers 위임 경로: 공식 processor 동작을 그대로
|
| 354 |
if self._delegate is None:
|
| 355 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 356 |
|
|
@@ -360,7 +358,7 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 360 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 361 |
"""
|
| 362 |
Format pixel_values according to return_tensors.
|
| 363 |
-
return_tensors에 맞춰 pixel_values 반환 포맷을
|
| 364 |
"""
|
| 365 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 366 |
return {"pixel_values": pixel_values}
|
|
@@ -370,6 +368,6 @@ class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
|
| 370 |
|
| 371 |
|
| 372 |
# Register this processor for AutoImageProcessor resolution.
|
| 373 |
-
# AutoImageProcessor 해석을 위해 이 processor를
|
| 374 |
if __name__ != "__main__":
|
| 375 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
|
|
|
| 4 |
# src/ds_proc.py
|
| 5 |
|
| 6 |
# ============================================================
|
| 7 |
+
# ImageProcessor (AutoImageProcessor integration)
|
| 8 |
+
# ImageProcessor (AutoImageProcessor 연동)
|
| 9 |
# ============================================================
|
| 10 |
|
| 11 |
from typing import Any
|
|
|
|
| 27 |
class BackboneMLPHead224ImageProcessor(ImageProcessingMixin):
|
| 28 |
"""
|
| 29 |
This processor performs image preprocessing and outputs {"pixel_values": ...}.
|
| 30 |
+
이 processor는 이미지 전처리를 수행하고 {"pixel_values": ...}를 반환함.
|
| 31 |
|
| 32 |
Key requirements:
|
| 33 |
핵심 요구사항:
|
| 34 |
|
| 35 |
1) save_pretrained() must produce a JSON-serializable preprocessor_config.json.
|
| 36 |
+
save_pretrained()는 JSON 직렬화 가능한 preprocessor_config.json을 생성해야 함.
|
|
|
|
| 37 |
2) Runtime-only objects (delegate processor, timm/torchvision transforms) must NOT be serialized.
|
| 38 |
+
런타임 객체(delegate processor, timm/torchvision transform)는 절대 직렬화하면 안 됨.
|
|
|
|
| 39 |
3) Runtime objects are rebuilt at init/load time based on backbone meta.
|
| 40 |
+
런타임 객체는 backbone meta에 따라 init/load 시점에 재구성.
|
|
|
|
| 41 |
4) For reproducibility, use_fast must be explicitly persisted and honored on load.
|
| 42 |
+
재현성을 위해 use_fast는 명시적으로 저장되고, 로드시 반드시 반영되어야 함.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# HF vision models conventionally expect "pixel_values" as the primary input key.
|
| 46 |
+
# HF vision 모델은 관례적으로 입력 키로 "pixel_values"를 기대.
|
| 47 |
model_input_names = ["pixel_values"]
|
| 48 |
|
| 49 |
def __init__(
|
| 50 |
self,
|
| 51 |
backbone_name_or_path: BackboneID,
|
| 52 |
+
is_training: bool = False, # timm 에서 data augmentation 용.
|
| 53 |
use_fast: bool = False,
|
| 54 |
**kwargs,
|
| 55 |
):
|
| 56 |
# ImageProcessingMixin stores extra kwargs and manages auto_map metadata.
|
| 57 |
+
# ImageProcessingMixin은 추가 kwargs를 저장하고 auto_map 메타를 관리.
|
| 58 |
super().__init__(**kwargs)
|
| 59 |
|
| 60 |
# Enforce whitelist via BACKBONE_META to keep behavior stable.
|
| 61 |
+
# 동작 안정성을 위해 BACKBONE_META 기반 화이트리스트를 강제. - fast fail
|
| 62 |
if backbone_name_or_path not in BACKBONE_META:
|
| 63 |
raise ValueError(
|
| 64 |
f"Unsupported backbone_name_or_path={backbone_name_or_path}. "
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
# Serializable fields only: these should appear in preprocessor_config.json.
|
| 69 |
+
# 직렬화 가능한 필드만: 이 값들만 preprocessor_config.json에 들어가야 함
|
| 70 |
self.backbone_name_or_path = backbone_name_or_path
|
| 71 |
self.is_training = bool(is_training)
|
| 72 |
|
| 73 |
# Reproducibility switch for transformers processors.
|
| 74 |
+
# transformers processor의 fast/slow 선택을 재현 가능하게 고정.
|
| 75 |
self.use_fast = bool(use_fast)
|
| 76 |
|
| 77 |
# Runtime-only fields: must never be serialized.
|
| 78 |
+
# 런타임 전용 필드: 절대 직렬화되면 안 됨.
|
| 79 |
self._meta = None
|
| 80 |
+
self._delegate = None
|
| 81 |
+
self._timm_transform = None
|
| 82 |
self._torchvision_transform = None
|
| 83 |
|
| 84 |
# Build runtime objects according to backbone type.
|
| 85 |
+
# backbone type에 따라 런타임 객체를 구성.
|
| 86 |
self._build_runtime()
|
| 87 |
|
| 88 |
# ============================================================
|
|
|
|
| 92 |
def _build_runtime(self):
|
| 93 |
"""
|
| 94 |
Build runtime delegate/transform based on BACKBONE_META["type"].
|
| 95 |
+
BACKBONE_META["type"]에 따라 런타임 delegate/transform을 구성.
|
| 96 |
"""
|
| 97 |
meta = BACKBONE_META[self.backbone_name_or_path]
|
| 98 |
self._meta = meta
|
| 99 |
|
| 100 |
# Always reset runtime fields before rebuilding.
|
| 101 |
+
# 재구성 전 런타임 필드는 항상 초기화.
|
| 102 |
self._delegate = None
|
| 103 |
self._timm_transform = None
|
| 104 |
self._torchvision_transform = None
|
|
|
|
| 107 |
|
| 108 |
if t == "timm_densenet":
|
| 109 |
# timm DenseNet uses timm.data transforms for ImageNet-style preprocessing.
|
| 110 |
+
# timm DenseNet은 ImageNet 전처리를 위해 timm.data transform을 사용.
|
| 111 |
self._timm_transform = self._build_timm_transform(
|
| 112 |
backbone_id=self.backbone_name_or_path,
|
| 113 |
is_training=self.is_training,
|
|
|
|
| 116 |
|
| 117 |
if t == "torchvision_densenet":
|
| 118 |
# torchvision DenseNet requires torchvision-style preprocessing (resize/crop/tensor/normalize).
|
| 119 |
+
# torchvision DenseNet은 torchvision 스타일 전처리(resize/crop/tensor/normalize)가 필요.
|
| 120 |
self._torchvision_transform = self._build_torchvision_densenet_transform(
|
| 121 |
is_training=self.is_training
|
| 122 |
)
|
| 123 |
return
|
| 124 |
|
| 125 |
# Default: transformers backbone delegates to its official AutoImageProcessor.
|
| 126 |
+
# 기본: transformers 백본은 공식 AutoImageProcessor에 위임.
|
| 127 |
#
|
| 128 |
# IMPORTANT:
|
| 129 |
+
# - use_fast는 transformers 기본값 변경에 흔들리지 않도록 반드시 명시적으로 전달.
|
| 130 |
self._delegate = AutoImageProcessor.from_pretrained(
|
| 131 |
self.backbone_name_or_path,
|
| 132 |
use_fast=self.use_fast,
|
|
|
|
| 137 |
def _build_timm_transform(*, backbone_id: str, is_training: bool):
|
| 138 |
"""
|
| 139 |
Create timm transform without storing non-serializable objects in config.
|
| 140 |
+
비직렬화 객체를 config에 저장하지 않고 timm transform을 생성.
|
| 141 |
"""
|
| 142 |
try:
|
| 143 |
import timm
|
|
|
|
| 148 |
) from e
|
| 149 |
|
| 150 |
# We only need model metadata to resolve data config, so pretrained=False is preferred.
|
| 151 |
+
# data config 추출만 필요하므로 pretrained=False를 우선 사용.
|
| 152 |
m = timm.create_model(f"hf_hub:{backbone_id}", pretrained=False, num_classes=0)
|
| 153 |
dc = resolve_model_data_config(m)
|
| 154 |
|
| 155 |
# create_transform returns a torchvision-like callable that maps PIL -> torch.Tensor(C,H,W).
|
| 156 |
+
# create_transform은 PIL -> torch.Tensor(C,H,W)로 매핑하는 callable을 반환.
|
| 157 |
+
tfm = create_transform(**dc, is_training=is_training) # is_training :Data Aug.
|
| 158 |
return tfm
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _build_torchvision_densenet_transform(*, is_training: bool):
|
| 162 |
"""
|
| 163 |
Build torchvision preprocessing for DenseNet-121 (224 pipeline).
|
| 164 |
+
DenseNet-121용 torchvision 전처리(224 파이프라인)를 구성.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
from torchvision import transforms
|
|
|
|
| 171 |
) from e
|
| 172 |
|
| 173 |
# These are the standard ImageNet normalization stats used by torchvision weights.
|
| 174 |
+
# 이 값들은 torchvision weights가 사용하는 표준 ImageNet 정규화 통계.
|
| 175 |
mean = (0.485, 0.456, 0.406)
|
| 176 |
+
std = (0.229, 0.224, 0.225)
|
| 177 |
|
| 178 |
# Training pipeline typically uses RandomResizedCrop and horizontal flip.
|
| 179 |
+
# 학습 파이프라인은 보통 RandomResizedCrop과 좌우반전을 사용.
|
| 180 |
if is_training:
|
| 181 |
return transforms.Compose(
|
| 182 |
[
|
| 183 |
+
# transforms.RandomResizedCrop(224),
|
| 184 |
+
# transforms.RandomHorizontalFlip(p=0.5),
|
| 185 |
+
transforms.Resize(224),
|
| 186 |
transforms.ToTensor(),
|
| 187 |
transforms.Normalize(mean=mean, std=std),
|
| 188 |
]
|
| 189 |
)
|
| 190 |
|
| 191 |
# Inference pipeline typically uses Resize(256) + CenterCrop(224).
|
| 192 |
+
# 추론 파이프라인은 보통 Resize(256) + CenterCrop(224)를 사용.
|
| 193 |
return transforms.Compose(
|
| 194 |
[
|
| 195 |
transforms.Resize(256),
|
| 196 |
+
# transforms.CenterCrop(224),
|
| 197 |
transforms.ToTensor(),
|
| 198 |
transforms.Normalize(mean=mean, std=std),
|
| 199 |
]
|
|
|
|
| 206 |
def to_dict(self) -> dict[str, Any]:
|
| 207 |
"""
|
| 208 |
Return a JSON-serializable dict for preprocessor_config.json.
|
| 209 |
+
preprocessor_config.json에 들어갈 JSON 직렬화 dict를 반환.
|
| 210 |
|
| 211 |
Important: do not leak runtime objects into the serialized dict.
|
| 212 |
+
중요: 런타임 객체가 직렬화 dict에 섞이면 안 됨.
|
| 213 |
"""
|
| 214 |
# ImageProcessingMixin.to_dict() adds metadata such as image_processor_type/auto_map.
|
| 215 |
# ImageProcessingMixin.to_dict()는 image_processor_type/auto_map 같은 메타를 추가합니다.
|
| 216 |
d = super().to_dict()
|
| 217 |
|
| 218 |
# Force minimal stable fields for long-term compatibility.
|
| 219 |
+
# 장기 호환을 위해 최소 안정 필드를 강제로 지정.
|
| 220 |
+
d["image_processor_type"] = self.__class__.__name__
|
| 221 |
d["backbone_name_or_path"] = self.backbone_name_or_path
|
| 222 |
d["is_training"] = self.is_training
|
| 223 |
+
d["use_fast"] = self.use_fast
|
| 224 |
|
| 225 |
# Remove any runtime-only fields defensively.
|
| 226 |
+
# 런타임 전용 필드는 보수적으로 제거.
|
| 227 |
for key in ["_meta", "_delegate", "_timm_transform", "_torchvision_transform"]:
|
| 228 |
d.pop(key, None)
|
| 229 |
|
|
|
|
| 233 |
def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
|
| 234 |
"""
|
| 235 |
Standard load path used by BaseImageProcessor / AutoImageProcessor.
|
| 236 |
+
BaseImageProcessor / AutoImageProcessor가 사용하는 표준 로드 경로임.
|
| 237 |
"""
|
| 238 |
backbone = image_processor_dict.get("backbone_name_or_path", None)
|
| 239 |
if backbone is None:
|
| 240 |
raise ValueError("preprocessor_config.json missing key: backbone_name_or_path")
|
| 241 |
|
| 242 |
is_training = bool(image_processor_dict.get("is_training", False))
|
| 243 |
+
use_fast = bool(image_processor_dict.get("use_fast", False))
|
| 244 |
|
| 245 |
return cls(
|
| 246 |
backbone_name_or_path=backbone,
|
|
|
|
| 253 |
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
|
| 254 |
"""
|
| 255 |
Fallback path if AutoImageProcessor calls class.from_pretrained directly.
|
| 256 |
+
AutoImageProcessor가 class.from_pretrained를 직접 호출하는 경우를 대비한 메서드.
|
| 257 |
|
| 258 |
Strategy:
|
| 259 |
전략:
|
| 260 |
|
| 261 |
- Read config.json via AutoConfig and recover backbone_name_or_path.
|
| 262 |
+
AutoConfig로 config.json을 읽고 backbone_name_or_path를 복구.
|
| 263 |
"""
|
| 264 |
|
| 265 |
# is_training is runtime-only and should default to False for inference/serving.
|
| 266 |
+
# is_training은 런타임 전용이며 추론/서빙 기본값은 False 임.
|
| 267 |
#
|
| 268 |
# IMPORTANT:
|
| 269 |
+
# - use_fast는 kwargs로 전달될 수 있으므로, 있으면 반영.
|
| 270 |
use_fast = bool(kwargs.pop("use_fast", False))
|
| 271 |
|
| 272 |
kwargs.pop("trust_remote_code", None)
|
|
|
|
| 287 |
@staticmethod
|
| 288 |
def _ensure_list(images: Any) -> list[Any]:
|
| 289 |
# Normalize scalar image input to a list for uniform processing.
|
| 290 |
+
# 단일 입력을 리스트로 정규화하여 동일한 처리 경로를 사용.
|
| 291 |
if isinstance(images, (list, tuple)):
|
| 292 |
return list(images)
|
| 293 |
return [images]
|
|
|
|
| 295 |
@staticmethod
|
| 296 |
def _to_pil_rgb(x: Any):
|
| 297 |
# Convert common image inputs into PIL RGB images.
|
| 298 |
+
# 일반적인 입력을 PIL RGB 이미지로 변환.
|
| 299 |
from PIL import Image as PILImage
|
| 300 |
|
| 301 |
if isinstance(x, PILImage.Image):
|
|
|
|
| 312 |
) -> dict[str, Any]:
|
| 313 |
"""
|
| 314 |
Convert images into {"pixel_values": Tensor/ndarray}.
|
| 315 |
+
이미지를 {"pixel_values": Tensor/ndarray}로 변환.
|
| 316 |
"""
|
| 317 |
images = self._ensure_list(images)
|
| 318 |
|
| 319 |
# Rebuild runtime if needed (e.g., right after deserialization).
|
| 320 |
+
# 직렬화 복원 직후 등 런타임이 비어있을 수 있으므로 재구성.
|
| 321 |
if (self._delegate is None) and (self._timm_transform is None) and (self._torchvision_transform is None):
|
| 322 |
self._build_runtime()
|
| 323 |
|
| 324 |
# timm path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 325 |
+
# timm 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 326 |
if self._timm_transform is not None:
|
| 327 |
pv: list[torch.Tensor] = []
|
| 328 |
for im in images:
|
|
|
|
| 335 |
return self._format_return(pixel_values, return_tensors)
|
| 336 |
|
| 337 |
# torchvision path: PIL -> torch.Tensor(C,H,W) normalized float32.
|
| 338 |
+
# torchvision 경로: PIL -> torch.Tensor(C,H,W) 정규화 float32.
|
| 339 |
if self._torchvision_transform is not None:
|
| 340 |
pv: list[torch.Tensor] = []
|
| 341 |
for im in images:
|
|
|
|
| 348 |
return self._format_return(pixel_values, return_tensors)
|
| 349 |
|
| 350 |
# transformers delegate path: rely on official processor behavior.
|
| 351 |
+
# transformers 위임 경로: 공식 processor 동작을 그대로 사용.
|
| 352 |
if self._delegate is None:
|
| 353 |
raise RuntimeError("Processor runtime not built: delegate is None and no transforms are available.")
|
| 354 |
|
|
|
|
| 358 |
def _format_return(pixel_values: torch.Tensor, return_tensors: str | TensorType | None) -> dict[str, Any]:
|
| 359 |
"""
|
| 360 |
Format pixel_values according to return_tensors.
|
| 361 |
+
return_tensors에 맞춰 pixel_values 반환 포맷을 변환.
|
| 362 |
"""
|
| 363 |
if return_tensors is None or return_tensors in ("pt", TensorType.PYTORCH):
|
| 364 |
return {"pixel_values": pixel_values}
|
|
|
|
| 368 |
|
| 369 |
|
| 370 |
# Register this processor for AutoImageProcessor resolution.
|
| 371 |
+
# AutoImageProcessor 해석을 위해 이 processor를 등록.
|
| 372 |
if __name__ != "__main__":
|
| 373 |
BackboneMLPHead224ImageProcessor.register_for_auto_class("AutoImageProcessor")
|
models/torchvision__densenet121/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 33394052
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3d373da2c45fdd83a13526586f67a2ccdc791505d1d5d26f878d6cb2a982e87
|
| 3 |
size 33394052
|