| """ CCT model configuration""" |
|
|
| from transformers import PretrainedConfig |
|
|
| CCT_PRETRAINED_CONFIG_ARCHIVE_MAP = { |
| "rishabbala/cct_14_7x2_384": "https://huggingface.co/rishabbala/cct_14_7x2_384/blob/main/config.json", |
| } |
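# Note: the checkpoint listed above can be loaded directly with
# `CctConfig.from_pretrained("rishabbala/cct_14_7x2_384")`, which downloads and parses its config.json.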


class CctConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CctModel`]. It is used to instantiate a CCT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the CCT
    [rishabbala/cct_14_7x2_384](https://huggingface.co/rishabbala/cct_14_7x2_384) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        img_size (`int`, *optional*, defaults to 384):
            The size (resolution) of the input image.
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`List[int]`, *optional*, defaults to `[64, 384]`):
            The number of output channels of each convolutional layer in the patch embedding.
        conv_kernel_size (`int`, *optional*, defaults to 7):
            The kernel size of the convolutional layers in the patch embedding.
        conv_stride (`int`, *optional*, defaults to 2):
            The stride of the convolutional layers in the patch embedding.
        conv_padding (`int`, *optional*, defaults to 3):
            The padding of the convolutional layers in the patch embedding.
        conv_bias (`bool`, *optional*, defaults to `False`):
            Whether the convolutional layers use a bias term.
        pool_kernel_size (`int`, *optional*, defaults to 3):
            The kernel size of the max pooling layers in the patch embedding.
        pool_stride (`int`, *optional*, defaults to 2):
            The stride of the max pooling layers in the patch embedding.
        pool_padding (`int`, *optional*, defaults to 1):
            The padding of the max pooling layers in the patch embedding.
        num_conv_layers (`int`, *optional*, defaults to 2):
            Number of convolutional embedding layers.
        embed_dim (`int`, *optional*, defaults to 384):
            Dimensionality of the encoder blocks.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in each block of the Transformer encoder.
        mlp_ratio (`float`, *optional*, defaults to 3.0):
            Ratio of the size of the hidden layer compared to the size of the input layer of the FFNs in the encoder
            blocks.
        attention_drop_rate (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        drop_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio following linear projections.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
        num_transformer_layers (`int`, *optional*, defaults to 14):
            Number of Transformer self-attention layers.
        pos_emb_type (`str`, *optional*, defaults to `"learnable"`):
            The type of positional embedding to use. Can be `"learnable"` or `"sinusoidal"`.

    Example:

    ```python
    >>> from transformers import CctConfig, CctModel

    >>> # Initializing a CCT rishabbala/cct_14_7x2_384 style configuration
    >>> configuration = CctConfig()

    >>> # Initializing a model (with random weights) from the rishabbala/cct_14_7x2_384 style configuration
    >>> model = CctModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "cct"

    def __init__(
        self,
        img_size=384,
        in_channels=3,
        out_channels=[64, 384],
        conv_kernel_size=7,
        conv_stride=2,
        conv_padding=3,
        conv_bias=False,
        pool_kernel_size=3,
        pool_stride=2,
        pool_padding=1,
        num_conv_layers=2,
        embed_dim=384,
        num_heads=6,
        mlp_ratio=3.0,
        attention_drop_rate=0.1,
        drop_rate=0.0,
        drop_path_rate=0.0,
        num_transformer_layers=14,
        pos_emb_type="learnable",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.img_size = img_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Channel dimension produced by the last convolutional embedding layer; this is the feature size
        # handed to the Transformer encoder.
        self.num_channels = out_channels[-1]
        self.conv_kernel_size = conv_kernel_size
        self.conv_stride = conv_stride
        self.conv_padding = conv_padding
        self.conv_bias = conv_bias
        self.pool_kernel_size = pool_kernel_size
        self.pool_stride = pool_stride
        self.pool_padding = pool_padding
        self.num_conv_layers = num_conv_layers
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.attention_drop_rate = attention_drop_rate
        self.drop_rate = drop_rate
        self.drop_path_rate = drop_path_rate
        self.num_transformer_layers = num_transformer_layers
        self.pos_emb_type = pos_emb_type
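

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the modeling code): how the default
# configuration translates into the convolutional tokenizer's output
# resolution and the Transformer sequence length. It assumes each embedding
# stage applies one convolution followed by one max pool, as described in the
# docstring above. Runs only when this file is executed directly; the helper
# `conv_output_size` is a local name, not a transformers API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    def conv_output_size(size: int, kernel: int, stride: int, padding: int) -> int:
        """Standard output-size formula for a convolution or pooling layer."""
        return (size + 2 * padding - kernel) // stride + 1

    config = CctConfig()
    resolution = config.img_size
    for _ in range(config.num_conv_layers):
        # Each embedding stage: convolution, then max pooling.
        resolution = conv_output_size(resolution, config.conv_kernel_size, config.conv_stride, config.conv_padding)
        resolution = conv_output_size(resolution, config.pool_kernel_size, config.pool_stride, config.pool_padding)

    # With the defaults (384x384 input, two stages of stride-2 conv + stride-2 pool),
    # this yields a 24x24 feature map, i.e. 576 tokens of dimension embed_dim=384.
    print(f"feature map: {resolution}x{resolution}, sequence length: {resolution ** 2}")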