| """HuggingFace PretrainedConfig for the SkySense++ model.""" |
|
|
| from transformers import PretrainedConfig |
|
|
|
|
class SkySensePPConfig(PretrainedConfig):
    """Configuration class for the SkySense++ multi-modal remote sensing model.

    This config captures all hyperparameters for the three backbones
    (HR / S2 / S1), the fusion encoder, the modality-completion VAE,
    and the decode head.

    Args:
        hr_arch (str): SwinTransformerV2 architecture variant. Default ``"huge"``.
        hr_img_size (int): HR input image size. Default ``512``.
        hr_patch_size (int): HR patch size. Default ``4``.
        hr_in_channels (int): HR input channels. Default ``3``.
        hr_window_size (int): HR window attention size. Default ``8``.
        hr_drop_path_rate (float): HR stochastic-depth rate. Default ``0.2``.
        hr_out_indices (tuple): HR output stage indices. Default ``(0, 1, 2, 3)``.
        hr_use_abs_pos_embed (bool): Use absolute position embeddings in HR.
            Default ``False``.
        hr_with_cp (bool): Use activation checkpointing in HR. Default ``True``.
        hr_pad_small_map (bool): Pad small feature maps in HR. Default ``True``.

        s2_img_size (tuple): S2 input image size. Default ``(16, 16)``.
        s2_patch_size (int): S2 patch size. Default ``4``.
        s2_in_channels (int): S2 input channels. Default ``10``.
        s2_embed_dims (int): S2 embedding dimensions. Default ``1024``.
        s2_num_layers (int): S2 transformer layers. Default ``24``.
        s2_num_heads (int): S2 attention heads. Default ``16``.
        s2_mlp_ratio (int): S2 MLP expansion ratio. Default ``4``.
        s2_out_indices (tuple): S2 output layer indices. Default ``(5, 11, 17, 23)``.
        s2_drop_path_rate (float): S2 stochastic-depth rate. Default ``0.3``.

        s1_img_size (tuple): S1 input image size. Default ``(16, 16)``.
        s1_patch_size (int): S1 patch size. Default ``4``.
        s1_in_channels (int): S1 input channels. Default ``2``.
        s1_embed_dims (int): S1 embedding dimensions. Default ``1024``.
        s1_num_layers (int): S1 transformer layers. Default ``24``.
        s1_num_heads (int): S1 attention heads. Default ``16``.

        fusion_input_dims (int): Fusion encoder input dims. Default ``2816``.
        fusion_embed_dims (int): Fusion encoder embed dims. Default ``1024``.
        fusion_num_layers (int): Fusion encoder layers. Default ``24``.
        fusion_num_heads (int): Fusion encoder heads. Default ``16``.
        fusion_with_cls_token (bool): Use CLS token in fusion. Default ``True``.
        fusion_output_cls_token (bool): Output CLS token from fusion.
            Default ``True``.

        decode_in_channels (list): Decode head input channel list.
            Default ``[704, 704, 1408, 2816, 1024]``.
        decode_channels (int): Decode head internal channels. Default ``512``.
        decode_num_classes (int): Number of segmentation classes. Default ``65``.

        vocabulary_size (int): Vocabulary size for masked-label tokenisation.
            Default ``64``.
        sources (list): Active modality sources. Default ``["hr", "s2", "s1"]``.
        use_modal_vae (bool): Enable modality-completion VAE. Default ``True``.
        calendar_time (int): Calendar time embedding size. Default ``366``.
        vae_subfolder (str): Subfolder for VAE weights (diffusers layout).
            Default ``"modality_vae"``. VAE loads from
            ``{path}/{vae_subfolder}/diffusion_pytorch_model.safetensors``,
            with fallback to ``{path}/modality_vae.safetensors``.
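
    Example:
        A minimal usage sketch. Nothing below is tied to a released
        checkpoint; it only reads back the defaults documented above and
        assumes ``SkySensePPConfig`` has been imported from this module::

            >>> config = SkySensePPConfig()
            >>> config.model_type
            'skysensepp'
            >>> (config.decode_num_classes, config.fusion_embed_dims)
            (65, 1024)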
| """ |
|
|
    model_type = "skysensepp"

    def __init__(
        self,

        hr_arch: str = "huge",
        hr_img_size: int = 512,
        hr_patch_size: int = 4,
        hr_in_channels: int = 3,
        hr_window_size: int = 8,
        hr_drop_path_rate: float = 0.2,
        hr_out_indices: tuple = (0, 1, 2, 3),
        hr_use_abs_pos_embed: bool = False,
        hr_with_cp: bool = True,
        hr_pad_small_map: bool = True,

        s2_img_size: tuple = (16, 16),
        s2_patch_size: int = 4,
        s2_in_channels: int = 10,
        s2_embed_dims: int = 1024,
        s2_num_layers: int = 24,
        s2_num_heads: int = 16,
        s2_mlp_ratio: int = 4,
        s2_out_indices: tuple = (5, 11, 17, 23),
        s2_drop_path_rate: float = 0.3,

        s1_img_size: tuple = (16, 16),
        s1_patch_size: int = 4,
        s1_in_channels: int = 2,
        s1_embed_dims: int = 1024,
        s1_num_layers: int = 24,
        s1_num_heads: int = 16,

        fusion_input_dims: int = 2816,
        fusion_embed_dims: int = 1024,
        fusion_num_layers: int = 24,
        fusion_num_heads: int = 16,
        fusion_with_cls_token: bool = True,
        fusion_output_cls_token: bool = True,

        decode_in_channels: Optional[list] = None,
        decode_channels: int = 512,
        decode_num_classes: int = 65,

        vocabulary_size: int = 64,
        sources: Optional[list] = None,
        use_modal_vae: bool = True,
        calendar_time: int = 366,
        vae_subfolder: str = "modality_vae",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # HR backbone (SwinTransformerV2)
        self.hr_arch = hr_arch
        self.hr_img_size = hr_img_size
        self.hr_patch_size = hr_patch_size
        self.hr_in_channels = hr_in_channels
        self.hr_window_size = hr_window_size
        self.hr_drop_path_rate = hr_drop_path_rate
        self.hr_out_indices = tuple(hr_out_indices)
        self.hr_use_abs_pos_embed = hr_use_abs_pos_embed
        self.hr_with_cp = hr_with_cp
        self.hr_pad_small_map = hr_pad_small_map

        # S2 backbone
        self.s2_img_size = tuple(s2_img_size)
        self.s2_patch_size = s2_patch_size
        self.s2_in_channels = s2_in_channels
        self.s2_embed_dims = s2_embed_dims
        self.s2_num_layers = s2_num_layers
        self.s2_num_heads = s2_num_heads
        self.s2_mlp_ratio = s2_mlp_ratio
        self.s2_out_indices = tuple(s2_out_indices)
        self.s2_drop_path_rate = s2_drop_path_rate

        # S1 backbone
        self.s1_img_size = tuple(s1_img_size)
        self.s1_patch_size = s1_patch_size
        self.s1_in_channels = s1_in_channels
        self.s1_embed_dims = s1_embed_dims
        self.s1_num_layers = s1_num_layers
        self.s1_num_heads = s1_num_heads

        # Fusion encoder
        self.fusion_input_dims = fusion_input_dims
        self.fusion_embed_dims = fusion_embed_dims
        self.fusion_num_layers = fusion_num_layers
        self.fusion_num_heads = fusion_num_heads
        self.fusion_with_cls_token = fusion_with_cls_token
        self.fusion_output_cls_token = fusion_output_cls_token

        # Decode head
        self.decode_in_channels = decode_in_channels or [704, 704, 1408, 2816, 1024]
        self.decode_channels = decode_channels
        self.decode_num_classes = decode_num_classes

        # Masked-label tokenisation, modality sources, VAE completion, and calendar-time embedding
        self.vocabulary_size = vocabulary_size
        self.sources = sources or ["hr", "s2", "s1"]
        self.use_modal_vae = use_modal_vae
        self.calendar_time = calendar_time
        self.vae_subfolder = vae_subfolder
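

if __name__ == "__main__":
    # Usage sketch, not part of the model definition: it exercises only the
    # generic ``PretrainedConfig`` API (``to_json_string``, ``save_pretrained``,
    # ``from_pretrained``) on a default-constructed config. The temporary
    # directory below is a placeholder, not a release path.
    import tempfile

    config = SkySensePPConfig()
    print(config.to_json_string())

    with tempfile.TemporaryDirectory() as tmpdir:
        config.save_pretrained(tmpdir)
        reloaded = SkySensePPConfig.from_pretrained(tmpdir)

        # Tuples round-trip as JSON lists, so compare a few representative
        # fields rather than the raw ``to_dict()`` output.
        assert reloaded.decode_num_classes == config.decode_num_classes
        assert list(reloaded.s2_out_indices) == list(config.s2_out_indices)
        assert reloaded.sources == config.sources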