base
- config.json +24 -0
- dino.safetensors +3 -0
- modeling_dinov2_dual.py +97 -0
config.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "architectures": [
+    "DualChannelDINOv2Model"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "drop_path_rate": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "image_size": 518,
+  "initializer_range": 0.02,
+  "layer_norm_eps": 1e-06,
+  "layerscale_value": 1.0,
+  "mlp_ratio": 4,
+  "model_type": "dinov2",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 14,
+  "qkv_bias": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.31.0.dev0",
+  "use_swiglu_ffn": false
+}
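
This is the stock DINOv2 ViT-B/14 configuration (768-dim hidden states, 12 layers, 12 heads, 14-pixel patches, 518-pixel images); the only customization is the "architectures" entry, which names the DualChannelDINOv2Model class added in modeling_dinov2_dual.py below. A minimal loading sketch, assuming the repo has been cloned and the snippet is run from its root:

    from transformers import Dinov2Config

    # "." points at the cloned repo root (illustrative path)
    config = Dinov2Config.from_pretrained(".")
    assert config.hidden_size == 768 and config.patch_size == 14
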
dino.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7718d343aa0369b8e730bbbb0f3b68516668869f3c8fe79945934572268088a
+size 229915824
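
The diff above shows only the Git LFS pointer; the actual ~230 MB checkpoint is fetched with git lfs pull after cloning. A hedged sketch of loading the weights with the safetensors library, assuming the checkpoint's key layout matches the module names in modeling_dinov2_dual.py (not verifiable from this commit alone, hence strict=False):

    from safetensors.torch import load_file
    from transformers import Dinov2Config

    from modeling_dinov2_dual import DualChannelDINOv2Model

    config = Dinov2Config.from_pretrained(".")
    model = DualChannelDINOv2Model(config)

    # load_file returns a flat {parameter name: tensor} dict
    state_dict = load_file("dino.safetensors")
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
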
modeling_dinov2_dual.py
ADDED
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+import copy
+
+from transformers import PreTrainedModel, Dinov2Config, Dinov2Model
+
+
+class DualChannelDINOv2Model(PreTrainedModel):
+    """
+    A modified DINOv2 model that uses DualChannelDINOv2Attention
+    for each self-attention layer.
+    """
+    config_class = Dinov2Config
+
+    def __init__(self, config: Dinov2Config):
+        super().__init__(config)
+        self.encoder = Dinov2Model(config).encoder
+        self.encoder = add_dual_channel_attention_to_dino(self.encoder)
+
+
+def add_dual_channel_attention_to_dino(dino_encoder: nn.Module):
+    """
+    Traverse the DINOv2 encoder, wrapping each layer's self-attention
+    (Dinov2Attention) with DualChannelDINOv2Attention. The original
+    attention is frozen, and a second trainable copy is created.
+    """
+    config = dino_encoder.config
+
+    for idx, layer in enumerate(dino_encoder.layer):
+        old_attn = layer.attention
+
+        # Create our wrapper around the old (frozen) attention
+        dual_attn = DualChannelDINOv2Attention(
+            attention_base=old_attn,
+            config=config,
+            layer_idx=idx,
+        )
+
+        layer.attention = dual_attn
+
+    return dino_encoder
+
+
+class DualChannelDINOv2Attention(nn.Module):
+    """
+    A wrapper that keeps two instances of Dinov2Attention:
+      1) attention_base (frozen)
+      2) attention_plus (trainable)
+    and fuses their outputs via a learnable alpha.
+    """
+    def __init__(self, attention_base: nn.Module, config, layer_idx: int = 0):
+        super().__init__()  # nn.Module.__init__ takes no config argument
+        self.attention_base = attention_base
+        self.attention_plus = copy.deepcopy(attention_base)
+        self.layer_idx = layer_idx
+
+        # Freeze the base branch; only attention_plus and alpha are trained
+        for param in self.attention_base.parameters():
+            param.requires_grad = False
+
+        # A learnable alpha parameter (vector of size [hidden_size])
+        self.alpha_param = nn.Parameter(torch.zeros(config.hidden_size))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: torch.Tensor = None,
+        output_attentions: bool = False,
+    ):
+        """
+        Runs both attention_base and attention_plus, then fuses the results.
+        """
+        # ---- base branch ----
+        base_ret = self.attention_base(
+            hidden_states,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        if isinstance(base_ret, tuple):
+            base_out = base_ret[0]
+            base_attn = base_ret[1] if len(base_ret) > 1 else None
+        else:
+            base_out, base_attn = base_ret, None
+
+        # ---- plus branch ----
+        plus_ret = self.attention_plus(
+            hidden_states,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        if isinstance(plus_ret, tuple):
+            plus_out = plus_ret[0]
+            plus_attn = plus_ret[1] if len(plus_ret) > 1 else None
+        else:
+            plus_out, plus_attn = plus_ret, None
+
+        # ---- fuse outputs ----
+        alpha = torch.sigmoid(self.alpha_param).view(1, 1, -1).to(dtype=base_out.dtype)
+        fused_out = alpha * base_out + (1.0 - alpha) * plus_out
+
+        if output_attentions:
+            # Return the base branch's attention map (the safest compatible choice)
+            return fused_out, base_attn
+        else:
+            # Hugging Face convention: a tuple of length 1
+            return (fused_out,)
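
One property worth noting: attention_plus starts as a deep copy of attention_base, and alpha_param starts at zero, so sigmoid(0) = 0.5 blends two identical outputs and the wrapped encoder is numerically equivalent to the original DINOv2 encoder at initialization; training then moves alpha and attention_plus away from that point. A small forward-pass sketch; the random tensor stands in for DINOv2 patch embeddings, which this class does not compute itself:

    import torch
    from transformers import Dinov2Config

    from modeling_dinov2_dual import DualChannelDINOv2Model

    config = Dinov2Config()  # defaults match the config.json above
    model = DualChannelDINOv2Model(config)

    # (batch, tokens, hidden); 257 tokens is illustrative (1 CLS + 256 patches)
    hidden = torch.randn(1, 257, config.hidden_size)
    out = model.encoder(hidden)
    print(out.last_hidden_state.shape)  # torch.Size([1, 257, 768])
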