Image-Text-to-Text
Transformers
Safetensors
kimi_k25
feature-extraction
compressed-tensors
conversational
custom_code
Instructions to use LittleDesignSolution/Kimi-K2.6 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use LittleDesignSolution/Kimi-K2.6 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="LittleDesignSolution/Kimi-K2.6", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("LittleDesignSolution/Kimi-K2.6", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use LittleDesignSolution/Kimi-K2.6 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "LittleDesignSolution/Kimi-K2.6"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LittleDesignSolution/Kimi-K2.6",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/LittleDesignSolution/Kimi-K2.6
- SGLang
How to use LittleDesignSolution/Kimi-K2.6 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "LittleDesignSolution/Kimi-K2.6" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LittleDesignSolution/Kimi-K2.6",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "LittleDesignSolution/Kimi-K2.6" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "LittleDesignSolution/Kimi-K2.6",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use LittleDesignSolution/Kimi-K2.6 with Docker Model Runner:
docker model run hf.co/LittleDesignSolution/Kimi-K2.6
| from transformers.configuration_utils import PretrainedConfig | |
| try: | |
| from configuration_deepseek import DeepseekV3Config | |
| except ImportError: | |
| from .configuration_deepseek import DeepseekV3Config | |
class KimiK25VisionConfig(PretrainedConfig):
    """Vision-tower and multimodal-projector settings for Kimi-K2.5.

    Every stored option is copied onto the instance under the same name as
    its constructor argument.

    Args:
        patch_size (int): Patch size for vision tower.
        init_pos_emb_height (int): Initial position embedding height.
        init_pos_emb_width (int): Initial position embedding width.
        init_pos_emb_time (int): Initial position embedding time dimension.
        pos_emb_type (str): Type of position embedding.
        vt_num_attention_heads (int): Number of attention heads in vision
            tower.
        vt_num_hidden_layers (int): Number of hidden layers in vision tower.
        vt_hidden_size (int): Hidden size of vision tower.
        vt_intermediate_size (int): Intermediate size in vision tower FFN.
        merge_kernel_size (tuple): Kernel size for patch merging.
        video_attn_type (str): Type of video attention.
        merge_type (str): Type of merge operation.
        _attn_implementation (str): Attention implementation type.
        mm_projector_type (str): Type of multimodal projector.
        mm_hidden_size (int | None): Projector input hidden size; when
            ``None`` it falls back to ``vt_hidden_size``.
        projector_hidden_act (str): Activation function for projector.
        projector_ln_eps (float): Layer norm epsilon for projector.
        text_hidden_size (int): Stored as-is; presumably the hidden size of
            the text model the projector targets -- confirm against the
            modeling code.

    NOTE(review): ``ignore_index``, ``media_placeholder_token_id``,
    ``pad_token_id``, ``use_unified_vision_chunk``, ``video_placeholder``
    and any extra ``**vision_config_kwargs`` are accepted but never stored,
    and ``PretrainedConfig.__init__`` is not called from this constructor.
    Presumably this keeps dict round-trips from the parent config from
    failing on unknown keys -- confirm before changing.
    """

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        init_pos_emb_time: int = 4,
        pos_emb_type: str = 'divided_fixed',
        vt_num_attention_heads: int = 16,
        vt_num_hidden_layers: int = 27,
        vt_hidden_size: int = 1152,
        vt_intermediate_size: int = 4304,
        merge_kernel_size: tuple = (2, 2),
        video_attn_type: str = 'spatial_temporal',
        merge_type: str = 'sd2_tpool',
        _attn_implementation: str = 'flash_attention_2',
        # MM Projector parameters
        mm_projector_type: str = 'patchmerger',
        mm_hidden_size: int | None = None,
        projector_hidden_act: str = "gelu",
        projector_ln_eps: float = 1e-5,
        # Other parameters
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = True,
        video_placeholder="<|kimi_k25_video_placeholder|>",
        text_hidden_size=7168,
        **vision_config_kwargs,
    ):
        # The projector consumes vision-tower features, so its input width
        # defaults to the tower's hidden size when not given explicitly.
        if mm_hidden_size is None:
            mm_hidden_size = vt_hidden_size

        # --- Vision tower: patching and position embeddings ---
        self.patch_size = patch_size
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        self.init_pos_emb_time = init_pos_emb_time
        self.pos_emb_type = pos_emb_type

        # --- Vision tower: transformer dimensions ---
        self.vt_num_attention_heads = vt_num_attention_heads
        self.vt_num_hidden_layers = vt_num_hidden_layers
        self.vt_hidden_size = vt_hidden_size
        self.vt_intermediate_size = vt_intermediate_size

        # --- Vision tower: merging and attention behaviour ---
        self.merge_kernel_size = merge_kernel_size
        self.video_attn_type = video_attn_type
        self.merge_type = merge_type
        self._attn_implementation = _attn_implementation

        # --- Multimodal projector ---
        self.mm_projector_type = mm_projector_type
        self.mm_hidden_size = mm_hidden_size
        self.projector_hidden_act = projector_hidden_act
        self.projector_ln_eps = projector_ln_eps
        self.text_hidden_size = text_hidden_size
class KimiK25Config(PretrainedConfig):
    """Kimi-K2.5 model configuration.

    Top-level config that pairs a text-model config with a vision config and
    carries a few multimodal options. Vision-tower and projector options are
    not constructor arguments of this class; they are passed through
    ``vision_config``.

    Args:
        text_config (dict | DeepseekV3Config | None): Configuration for the
            text model. A ``dict`` is expanded into ``DeepseekV3Config``;
            any other value, including ``None``, is stored unchanged.
        vision_config (dict | KimiK25VisionConfig | None): Configuration for
            the vision tower and multimodal projector. A ``dict`` is expanded
            into ``KimiK25VisionConfig``; any other value, including
            ``None``, is stored unchanged.
        ignore_index (int): The ignore index for the loss function.
        media_placeholder_token_id (int): The token ID to use for media
            placeholders.
        pad_token_id (int): The token ID to use for padding. Forwarded to
            ``PretrainedConfig.__init__`` rather than set directly here.
        use_unified_vision_chunk (bool): Stored as-is on the config.
            NOTE(review): its exact effect is defined by the modeling and
            processing code, which is not visible here.
        video_placeholder (str): Stored as-is on the config; presumably the
            placeholder text marking video inputs -- confirm against the
            processor.
        **kwargs: Remaining options, forwarded to
            ``PretrainedConfig.__init__``.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        text_config: dict | DeepseekV3Config | None = None,
        vision_config: dict | KimiK25VisionConfig | None = None,
        # Other parameters
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = True,
        video_placeholder="<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Sub-configs may arrive as plain dicts (e.g. parsed from JSON);
        # promote them to their config classes.
        if isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        if isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        self.text_config = text_config
        self.vision_config = vision_config

        # Other config
        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder

        # Mirror the text model's quantization settings onto this config
        # when the text config carries them. getattr also tolerates
        # text_config being None.
        if getattr(self.text_config, "quantization_config", None) is not None:
            self.quantization_config = self.text_config.quantization_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)