# File: f2p_decoder/configuration_f2p_decoder.py
# Committed by: toilaluan
# Commit message: "Upload folder using huggingface_hub"
# Commit: 09b2c2d (verified)
from transformers import PretrainedConfig
class F2PDecoderConfig(PretrainedConfig):
    """Settings container for a feature-to-pixel reconstruction decoder.

    Every constructor argument is stored on the instance under the same
    name after being coerced to its annotated builtin type (``int``,
    ``float``, ``bool`` or a list of ``float``); the two string arguments
    are stored unchanged.

    Args:
        pretrained_encoder_name: Identifier string of the paired encoder
            (presumably a Hub repo id -- confirm against the model code).
        source_decoder_repo: Identifier string of the decoder the weights
            originate from (presumably a Hub repo id -- confirm).
        image_size, patch_size, num_channels, num_patches: Image geometry
            values, stored as ``int``.
        hidden_size, decoder_hidden_size, decoder_num_hidden_layers,
            decoder_num_attention_heads, decoder_intermediate_size:
            Layer dimensions, stored as ``int``.
        hidden_act: Activation name, stored as given.
        hidden_dropout_prob, attention_probs_dropout_prob,
            initializer_range, layer_norm_eps: Stored as ``float``.
        qkv_bias: Stored as ``bool``.
        drop_cls_token: Must be truthy; stored as ``bool``.
        image_mean, image_std: Per-channel values; each falls back to
            ``[0.5, 0.5, 0.5]`` when ``None`` and is stored as a list of
            ``float``.
        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``image_mean`` or ``image_std`` does not contain
            exactly ``num_channels`` entries, or if ``drop_cls_token`` is
            falsy.
    """

    model_type = "f2p_decoder"

    def __init__(
        self,
        pretrained_encoder_name: str = "google/siglip2-so400m-patch14-224",
        source_decoder_repo: str = "nyu-visionx/siglip2_decoder",
        image_size: int = 224,
        patch_size: int = 14,
        num_channels: int = 3,
        hidden_size: int = 1152,
        decoder_hidden_size: int = 1152,
        decoder_num_hidden_layers: int = 28,
        decoder_num_attention_heads: int = 16,
        decoder_intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        qkv_bias: bool = True,
        num_patches: int = 256,
        drop_cls_token: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        # Only install the default mapping when no ``auto_map`` is present
        # after base-class initialisation (e.g. one supplied via kwargs --
        # presumably; confirm against PretrainedConfig).
        if getattr(self, "auto_map", None) is None:
            self.auto_map = {
                "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
                "AutoModel": "modeling_f2p_decoder.F2PDecoderModel",
            }

        # ``None`` defaults avoid sharing one mutable list between instances.
        image_mean = [0.5, 0.5, 0.5] if image_mean is None else image_mean
        image_std = [0.5, 0.5, 0.5] if image_std is None else image_std

        # Both statistics need exactly one entry per channel.
        if not (len(image_mean) == num_channels and len(image_std) == num_channels):
            raise ValueError("image_mean and image_std must match num_channels.")

        if not drop_cls_token:
            raise ValueError("Only drop_cls_token=True is supported by this decoder.")

        # Provenance identifiers (stored verbatim).
        self.pretrained_encoder_name = pretrained_encoder_name
        self.source_decoder_repo = source_decoder_repo

        # Integer-valued geometry and layer dimensions.
        self.image_size = int(image_size)
        self.patch_size = int(patch_size)
        self.num_channels = int(num_channels)
        self.hidden_size = int(hidden_size)
        self.decoder_hidden_size = int(decoder_hidden_size)
        self.decoder_num_hidden_layers = int(decoder_num_hidden_layers)
        self.decoder_num_attention_heads = int(decoder_num_attention_heads)
        self.decoder_intermediate_size = int(decoder_intermediate_size)

        # Activation name plus float-valued hyper-parameters.
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = float(hidden_dropout_prob)
        self.attention_probs_dropout_prob = float(attention_probs_dropout_prob)
        self.initializer_range = float(initializer_range)
        self.layer_norm_eps = float(layer_norm_eps)

        self.qkv_bias = bool(qkv_bias)
        self.num_patches = int(num_patches)
        self.drop_cls_token = bool(drop_cls_token)

        # Copy into fresh lists of floats so the caller's sequences are not aliased.
        self.image_mean = [float(component) for component in image_mean]
        self.image_std = [float(component) for component in image_std]