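# NOTE(review): page-export residue, not part of the module -- "File size: 7,527 Bytes"

# NOTE(review): page-export residue (looks like a commit hash) -- "17461fb"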

from typing import Literal, Any

# ============================================================
# 1) Allowed 224 backbones (fixed whitelist)
# ============================================================
# BackboneID defines the only backbone identifiers that configs are allowed
# to use.
BackboneID = Literal[
    # transformers (ViT / Swin)
    "google/vit-base-patch16-224",
    "microsoft/swin-tiny-patch4-window7-224",
    # transformers (CNNs)
    "microsoft/resnet-50",
    "google/efficientnet-b0",
    # timm (DenseNet via HF Hub)
    "timm/densenet121.tv_in1k",
    # torchvision (DenseNet direct)
    "torchvision/densenet121",
]

# ============================================================
# 2) Backbone metadata registry (feature dim / feature rule / unfreeze rule)
# ============================================================
# Single source of truth for the feature-extraction and fine-tuning rules of
# each backbone. Keys are typed as BackboneID so the registry keys cannot
# drift away from the whitelist.
#
# Every entry carries the same five fields, in this order:
#   type      - which loading / forward / feature-extraction pathway the
#               model code should use.
#   feat_dim  - dimension of the feature vector consumed by the MLP head.
#   feat_rule - how to get a (B, feat_dim) tensor from the backbone outputs:
#                 "cls"          -> last_hidden_state[:, 0, :] (CLS token
#                                   embedding)
#                 "pool_or_mean" -> pooler output if available, otherwise
#                                   mean pooling
#                 "pool_or_gap"  -> pooler output if the model provides it,
#                                   otherwise global average pooling (GAP)
#                 "timm_gap"     -> GAP over the feature map that timm's
#                                   forward_features typically returns
#                 "torchvision_densenet_gap"
#                                -> GAP over what torchvision DenseNet's
#                                   .features produces
#   unfreeze  - policy for unfreezing layers during stage2 fine-tuning;
#               "last_n" unfreezes the last n encoder blocks (for CNNs the
#               model code can apply it at block/stage granularity).
#   has_bn    - whether BatchNorm exists and therefore needs careful
#               handling when freezing.
BACKBONE_META: dict[BackboneID, dict[str, Any]] = {
    # ---------------------------------------------------------
    # Transformers (ViT/Swin)
    # These backbones come from transformers and typically output hidden
    # states and/or pooler outputs.
    # ---------------------------------------------------------
    "google/vit-base-patch16-224": dict(
        type="vit",
        feat_dim=768,
        feat_rule="cls",
        unfreeze="last_n",
        has_bn=False,
    ),
    # Swin Transformer; whether a pooler output exists may depend on the
    # implementation, hence the pooler-first rule with a mean-pooling
    # fallback. The unfreeze strategy follows transformer-style encoder blocks.
    "microsoft/swin-tiny-patch4-window7-224": dict(
        type="swin",
        feat_dim=768,
        feat_rule="pool_or_mean",
        unfreeze="last_n",
        has_bn=False,
    ),
    # ---------------------------------------------------------
    # Transformers (CNNs)
    # CNNs exposed via transformers, usually producing pooled feature
    # vectors or feature maps.
    # ---------------------------------------------------------
    # Assumes a transformers-compatible ResNet that can expose a pooler or a
    # final feature map.
    "microsoft/resnet-50": dict(
        type="resnet",
        feat_dim=2048,
        feat_rule="pool_or_gap",
        unfreeze="last_n",
        has_bn=True,
    ),
    # Assumes a transformers-compatible EfficientNet that exposes pooled
    # features or a final feature map.
    "google/efficientnet-b0": dict(
        type="efficientnet",
        feat_dim=1280,
        feat_rule="pool_or_gap",
        unfreeze="last_n",
        has_bn=True,
    ),
    # ---------------------------------------------------------
    # timm (DenseNet via HF Hub)
    # Loaded through timm with the "hf_hub:" prefix in the model loader.
    # ---------------------------------------------------------
    # 1024 is the final channel dimension of the canonical DenseNet-121.
    # DenseNet uses BatchNorm heavily, so freeze_bn behavior matters for
    # stage1/stage2.
    "timm/densenet121.tv_in1k": dict(
        type="timm_densenet",
        feat_dim=1024,
        feat_rule="timm_gap",
        unfreeze="last_n",
        has_bn=True,
    ),
    # ---------------------------------------------------------
    # torchvision (DenseNet direct)
    # Intended for torchvision-style loading and feature extraction, not
    # transformers/timm.
    # ---------------------------------------------------------
    # The policy stays "last_n", but its interpretation must match
    # torchvision module naming.
    "torchvision/densenet121": dict(
        type="torchvision_densenet",
        feat_dim=1024,
        feat_rule="torchvision_densenet_gap",
        unfreeze="last_n",
        has_bn=True,
    ),
}