from transformers import PretrainedConfig


class F2PDecoderConfig(PretrainedConfig):
    """Configuration for a feature-to-pixel (F2P) reconstruction decoder.

    Holds the identifiers of the source encoder/decoder checkpoints, the
    image/patch geometry, the decoder transformer hyper-parameters, and the
    per-channel normalization statistics. Also registers ``auto_map`` entries
    so ``AutoConfig``/``AutoModel`` can resolve the custom classes.
    """

    model_type = "f2p_decoder"

    def __init__(
        self,
        pretrained_encoder_name: str = "google/siglip2-so400m-patch14-224",
        source_decoder_repo: str = "nyu-visionx/siglip2_decoder",
        image_size: int = 224,
        patch_size: int = 14,
        num_channels: int = 3,
        hidden_size: int = 1152,
        decoder_hidden_size: int = 1152,
        decoder_num_hidden_layers: int = 28,
        decoder_num_attention_heads: int = 16,
        decoder_intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        qkv_bias: bool = True,
        num_patches: int = 256,
        drop_cls_token: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        **kwargs,
    ) -> None:
        """Validate and store the configuration fields.

        Raises:
            ValueError: if ``image_mean``/``image_std`` do not have exactly
                ``num_channels`` entries, or if ``drop_cls_token`` is False
                (the only supported mode).
        """
        super().__init__(**kwargs)

        # ``super().__init__`` may already have restored ``auto_map`` from a
        # serialized config; install the defaults only when it is absent.
        if getattr(self, "auto_map", None) is None:
            self.auto_map = {
                "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
                "AutoModel": "modeling_f2p_decoder.F2PDecoderModel",
            }

        # Default normalization uses mean/std of 0.5 per channel.
        if image_mean is None:
            image_mean = [0.5] * 3
        if image_std is None:
            image_std = [0.5] * 3

        if len(image_mean) != num_channels or len(image_std) != num_channels:
            raise ValueError("image_mean and image_std must match num_channels.")
        if not drop_cls_token:
            raise ValueError("Only drop_cls_token=True is supported by this decoder.")

        # Source checkpoint identifiers.
        self.pretrained_encoder_name = pretrained_encoder_name
        self.source_decoder_repo = source_decoder_repo

        # Image / patch-grid geometry.
        self.image_size = int(image_size)
        self.patch_size = int(patch_size)
        self.num_channels = int(num_channels)
        self.num_patches = int(num_patches)
        self.drop_cls_token = bool(drop_cls_token)

        # Encoder feature width and decoder transformer shape.
        self.hidden_size = int(hidden_size)
        self.decoder_hidden_size = int(decoder_hidden_size)
        self.decoder_num_hidden_layers = int(decoder_num_hidden_layers)
        self.decoder_num_attention_heads = int(decoder_num_attention_heads)
        self.decoder_intermediate_size = int(decoder_intermediate_size)
        self.hidden_act = hidden_act
        self.qkv_bias = bool(qkv_bias)

        # Regularization and initialization settings.
        self.hidden_dropout_prob = float(hidden_dropout_prob)
        self.attention_probs_dropout_prob = float(attention_probs_dropout_prob)
        self.initializer_range = float(initializer_range)
        self.layer_norm_eps = float(layer_norm_eps)

        # Per-channel normalization statistics, coerced to plain floats.
        self.image_mean = [float(component) for component in image_mean]
        self.image_std = [float(component) for component in image_std]