# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX SEED CLIP model configuration.

Config classes for HyperCLOVAX SEED CLIP with vision encoder + SiglipText encoder.
"""

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.models.siglip.configuration_siglip import SiglipTextConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class HyperCLOVAXSeedCLIPVisionConfig(PretrainedConfig):
    """Vision config for HyperCLOVAX SEED CLIP.

    Includes parameters for the vision encoder transformer and the Siglip2-style
    attention pooling head.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3456):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 16):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 128):
            Window size (in pixels) for window attention blocks.
        fullatt_block_indexes (`List[int]`, *optional*, defaults to all blocks):
            Indices of transformer blocks that use full (global) attention.
            Defaults to all blocks (full attention everywhere).
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        attn_pool_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in the pooling head.
        attn_pool_mlp_ratio (`float`, *optional*, defaults to 4.0):
            MLP expansion ratio in the pooling head.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Attention implementation (`"eager"`, `"sdpa"`, or `"flash_attention_2"`).
    """

    model_type = "hyperclovax_seed_clip_vision"

    def __init__(
        self,
        depth=32,
        hidden_size=1280,
        hidden_act="silu",
        intermediate_size=3456,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=128,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        attn_pool_heads=16,
        attn_pool_mlp_ratio=4.0,
        attn_implementation="eager",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # `None` means "full attention everywhere": every block index is listed.
        self.fullatt_block_indexes = (
            fullatt_block_indexes if fullatt_block_indexes is not None else list(range(depth))
        )
        self.initializer_range = initializer_range
        self.attn_pool_heads = attn_pool_heads
        self.attn_pool_mlp_ratio = attn_pool_mlp_ratio
        self.attn_implementation = attn_implementation


# Make the vision config resolvable through `AutoConfig.from_pretrained`.
AutoConfig.register("hyperclovax_seed_clip_vision", HyperCLOVAXSeedCLIPVisionConfig)


class HyperCLOVAXSeedCLIPConfig(PretrainedConfig):
    """Config for HyperCLOVAX SEED CLIP.

    Combines a SiglipTextConfig (text) and HyperCLOVAXSeedCLIPVisionConfig (vision).

    Args:
        text_config (`dict` or `SiglipTextConfig`, *optional*):
            Text sub-config (or kwargs dict for one). Defaults to `SiglipTextConfig()`.
        vision_config (`dict` or `HyperCLOVAXSeedCLIPVisionConfig`, *optional*):
            Vision sub-config (or kwargs dict for one). Defaults to
            `HyperCLOVAXSeedCLIPVisionConfig()`.
    """

    model_type = "hyperclovax_seed_clip"
    sub_configs = {"text_config": SiglipTextConfig, "vision_config": HyperCLOVAXSeedCLIPVisionConfig}

    def __init__(self, text_config=None, vision_config=None, **kwargs):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Using SiglipTextConfig defaults.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Using HyperCLOVAXSeedCLIPVisionConfig defaults.")

        # Accept already-built config objects as well as plain kwargs dicts
        # (the dict path is the original behavior; instances are normalized
        # through `to_dict()` so both paths produce fresh sub-config objects).
        if isinstance(text_config, SiglipTextConfig):
            text_config = text_config.to_dict()
        if isinstance(vision_config, HyperCLOVAXSeedCLIPVisionConfig):
            vision_config = vision_config.to_dict()

        self.text_config = SiglipTextConfig(**text_config)
        self.vision_config = HyperCLOVAXSeedCLIPVisionConfig(**vision_config)
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: SiglipTextConfig,
        vision_config: HyperCLOVAXSeedCLIPVisionConfig,
        **kwargs,
    ):
        """Build a combined config from a text and a vision sub-config instance."""
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )