File size: 2,923 Bytes
09b2c2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from transformers import PretrainedConfig


class F2PDecoderConfig(PretrainedConfig):
    """Configuration for a feature-to-pixel reconstruction decoder.

    Holds the geometry of the patchified input image, the transformer
    decoder's dimensions, and the per-channel normalization statistics
    used when reconstructing pixels from encoder features.
    """

    model_type = "f2p_decoder"

    def __init__(
        self,
        pretrained_encoder_name: str = "google/siglip2-so400m-patch14-224",
        source_decoder_repo: str = "nyu-visionx/siglip2_decoder",
        image_size: int = 224,
        patch_size: int = 14,
        num_channels: int = 3,
        hidden_size: int = 1152,
        decoder_hidden_size: int = 1152,
        decoder_num_hidden_layers: int = 28,
        decoder_num_attention_heads: int = 16,
        decoder_intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        qkv_bias: bool = True,
        num_patches: int = 256,
        drop_cls_token: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        # Register the config/model classes for AutoConfig/AutoModel unless an
        # auto_map was already supplied (e.g. via kwargs or a loaded checkpoint).
        if getattr(self, "auto_map", None) is None:
            self.auto_map = {
                "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
                "AutoModel": "modeling_f2p_decoder.F2PDecoderModel",
            }

        # Fall back to 0.5/0.5 normalization stats when none are provided.
        mean = [0.5, 0.5, 0.5] if image_mean is None else image_mean
        std = [0.5, 0.5, 0.5] if image_std is None else image_std
        # Both stat vectors must carry exactly one value per image channel.
        if not (len(mean) == num_channels == len(std)):
            raise ValueError("image_mean and image_std must match num_channels.")
        # The decoder has no notion of a CLS token; refuse configs that keep it.
        if not drop_cls_token:
            raise ValueError("Only drop_cls_token=True is supported by this decoder.")

        # Assignment order is preserved so the serialized config key order
        # (insertion-ordered dict) matches earlier checkpoints.
        self.pretrained_encoder_name = pretrained_encoder_name
        self.source_decoder_repo = source_decoder_repo
        self.image_size = int(image_size)
        self.patch_size = int(patch_size)
        self.num_channels = int(num_channels)
        self.hidden_size = int(hidden_size)
        self.decoder_hidden_size = int(decoder_hidden_size)
        self.decoder_num_hidden_layers = int(decoder_num_hidden_layers)
        self.decoder_num_attention_heads = int(decoder_num_attention_heads)
        self.decoder_intermediate_size = int(decoder_intermediate_size)
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = float(hidden_dropout_prob)
        self.attention_probs_dropout_prob = float(attention_probs_dropout_prob)
        self.initializer_range = float(initializer_range)
        self.layer_norm_eps = float(layer_norm_eps)
        self.qkv_bias = bool(qkv_bias)
        self.num_patches = int(num_patches)
        self.drop_cls_token = bool(drop_cls_token)
        # Coerce stats to plain floats so the config serializes cleanly.
        self.image_mean = list(map(float, mean))
        self.image_std = list(map(float, std))