# PEP 604 unions ("int | None") appear in annotations below; this import keeps
# the module importable on Python < 3.10.
from __future__ import annotations

from transformers.configuration_utils import PretrainedConfig

try:
    # Prefer the upstream config when this transformers version ships it.
    from transformers import DeepseekV3Config
except ImportError:
    # Fall back to the local copy bundled alongside this file.
    from .configuration_deepseek import DeepseekV3Config


class KimiK25VisionConfig(PretrainedConfig):
    """Configuration for the Kimi-K2.5 vision tower and multimodal projector."""

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        init_pos_emb_time: int = 4,
        pos_emb_type: str = "divided_fixed",
        vt_num_attention_heads: int = 16,
        vt_num_hidden_layers: int = 27,
        vt_hidden_size: int = 1152,
        vt_intermediate_size: int = 4304,
        merge_kernel_size: tuple = (2, 2),
        video_attn_type: str = "spatial_temporal",
        merge_type: str = "sd2_tpool",
        _attn_implementation: str = "flash_attention_2",
        # Multimodal projector parameters.
        mm_projector_type: str = "patchmerger",
        mm_hidden_size: int | None = None,
        projector_hidden_act: str = "gelu",
        projector_ln_eps: float = 1e-5,
        # The following mirror KimiK25Config's arguments. They are accepted
        # here (presumably so one flat kwargs dict can initialize both
        # configs) but are stored by KimiK25Config, not on this config.
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = True,
        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
        # Hidden size of the paired text model.
        text_hidden_size: int = 7168,
        **vision_config_kwargs,
    ):
        self.patch_size = patch_size
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        self.init_pos_emb_time = init_pos_emb_time
        self.pos_emb_type = pos_emb_type
        self.vt_num_attention_heads = vt_num_attention_heads
        self.vt_num_hidden_layers = vt_num_hidden_layers
        self.vt_hidden_size = vt_hidden_size
        self.vt_intermediate_size = vt_intermediate_size
        self.merge_kernel_size = merge_kernel_size
        self.video_attn_type = video_attn_type
        self.merge_type = merge_type
        self._attn_implementation = _attn_implementation

        self.mm_projector_type = mm_projector_type
        # Fall back to the vision tower width when no projector input size is given.
        self.mm_hidden_size = mm_hidden_size if mm_hidden_size is not None else vt_hidden_size
        self.projector_hidden_act = projector_hidden_act
        self.projector_ln_eps = projector_ln_eps
        self.text_hidden_size = text_hidden_size

        # Let PretrainedConfig set the standard config attributes and absorb
        # any extra kwargs.
        super().__init__(**vision_config_kwargs)
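
# Illustrative sketch, not an official example: because the signature above
# also accepts KimiK25Config's token/placeholder arguments, a single flat dict
# can be splatted into this config without raising TypeError.
#
#   shared = {"patch_size": 14, "pad_token_id": 0, "media_placeholder_token_id": 163605}
#   vision_cfg = KimiK25VisionConfig(**shared)  # token/placeholder keys are accepted, not stored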


class KimiK25Config(PretrainedConfig):
    """Kimi-K2.5 model configuration.

    Args:
        text_config (dict | DeepseekV3Config): Configuration for the text model.
        vision_config (dict | KimiK25VisionConfig): Configuration for the vision
            tower and multimodal projector; the parameters below live on it.

        Vision tower parameters (from MoonViT3dConfig, stored on vision_config):
            patch_size (int): Patch size for the vision tower.
            init_pos_emb_height (int): Initial position-embedding height.
            init_pos_emb_width (int): Initial position-embedding width.
            init_pos_emb_time (int): Initial position-embedding time dimension.
            pos_emb_type (str): Type of position embedding.
            vt_num_attention_heads (int): Number of attention heads in the vision tower.
            vt_num_hidden_layers (int): Number of hidden layers in the vision tower.
            vt_hidden_size (int): Hidden size of the vision tower.
            vt_intermediate_size (int): Intermediate size of the vision tower FFN.
            merge_kernel_size (tuple): Kernel size for patch merging.
            video_attn_type (str): Type of video attention.
            merge_type (str): Type of merge operation.
            _attn_implementation (str): Attention implementation to use.

        Multimodal projector parameters (from MultiModalProjectorConfig, stored on vision_config):
            mm_projector_type (str): Type of multimodal projector.
            mm_hidden_size (int): Hidden size coming from the vision tower;
                defaults to vt_hidden_size when unset.
            projector_hidden_act (str): Activation function for the projector.
            projector_ln_eps (float): Layer-norm epsilon for the projector.

        Other parameters:
            ignore_index (int): Ignore index for the loss function.
            media_placeholder_token_id (int): Token ID used for media placeholders.
            pad_token_id (int): Token ID used for padding.
            use_unified_vision_chunk (bool): Whether to use unified vision chunking.
            video_placeholder (str): Placeholder string marking video content in prompts.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        text_config: dict | DeepseekV3Config | None = None,
        vision_config: dict | KimiK25VisionConfig | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = True,
        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Coerce plain dicts (e.g. parsed from config.json) into their config classes.
        if isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        if isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        self.text_config = text_config
        self.vision_config = vision_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder
        # Surface the text model's quantization settings at the top level so
        # loaders that check `config.quantization_config` can find them.
        if getattr(self.text_config, "quantization_config", None) is not None:
            self.quantization_config = self.text_config.quantization_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)
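

# A minimal usage sketch, assuming default DeepseekV3Config values stand in for
# the real Kimi-K2.5 text settings; illustrative only, not part of the module's API.
if __name__ == "__main__":
    config = KimiK25Config(
        text_config={},                    # coerced into DeepseekV3Config
        vision_config={"patch_size": 14},  # coerced into KimiK25VisionConfig
    )
    print(type(config.text_config).__name__)    # -> DeepseekV3Config
    print(config.vision_config.mm_hidden_size)  # -> 1152 (falls back to vt_hidden_size)
    print(config.pad_token_id)                  # -> 0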