Fix: resolve transformers version compatibility for DynamicLayer and cache initialization
#18
by FALcon6 - opened
- modeling_minicpm.py +153 -10
modeling_minicpm.py
CHANGED
|
@@ -16,6 +16,7 @@
|
|
| 16 |
import math
|
| 17 |
import re
|
| 18 |
import warnings
|
|
|
|
| 19 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 20 |
|
| 21 |
import torch
|
|
@@ -24,7 +25,7 @@ import torch.utils.checkpoint
|
|
| 24 |
from torch import nn
|
| 25 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 26 |
from transformers.activations import ACT2FN
|
| 27 |
-
from transformers.cache_utils import Cache, DynamicCache
|
| 28 |
from transformers.modeling_attn_mask_utils import (
|
| 29 |
AttentionMaskConverter,
|
| 30 |
_prepare_4d_attention_mask,
|
|
@@ -233,6 +234,145 @@ class CompressK(torch.nn.Module):
|
|
| 233 |
return compressed_k, cu_seqlens_compressed
|
| 234 |
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
class InfLLMv2CacheLayer(DynamicLayer):
|
| 238 |
def __init__(self):
|
|
@@ -1814,18 +1954,21 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
|
|
| 1814 |
past_key_values_length = 0
|
| 1815 |
|
| 1816 |
if use_cache:
|
| 1817 |
-
|
| 1818 |
-
if
|
| 1819 |
raise ValueError(
|
| 1820 |
'You must use the new past_key_values format, such as the Cache class, instead of the old tuple format.'
|
| 1821 |
)
|
| 1822 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1823 |
# Calculate the usable length of past key values
|
| 1824 |
-
past_key_values_length = past_key_values.get_seq_length()
|
| 1825 |
-
|
| 1826 |
-
# Initialize InfLLMv2Cache if needed
|
| 1827 |
-
if self.config.sparse_config is not None and torch.cuda.is_available() and past_key_values_length == 0:
|
| 1828 |
-
past_key_values = InfLLMv2Cache(config = self.config, num_hidden_layers=self.config.num_hidden_layers)
|
| 1829 |
|
| 1830 |
if position_ids is None:
|
| 1831 |
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
|
@@ -1907,7 +2050,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
|
|
| 1907 |
|
| 1908 |
next_cache = None
|
| 1909 |
if use_cache:
|
| 1910 |
-
next_cache = next_decoder_cache
|
| 1911 |
if not return_dict:
|
| 1912 |
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
| 1913 |
return BaseModelOutputWithPast(
|
|
|
|
| 16 |
import math
|
| 17 |
import re
|
| 18 |
import warnings
|
| 19 |
+
from abc import ABC, abstractmethod
|
| 20 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 21 |
|
| 22 |
import torch
|
|
|
|
| 25 |
from torch import nn
|
| 26 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 27 |
from transformers.activations import ACT2FN
|
| 28 |
+
from transformers.cache_utils import Cache, DynamicCache
|
| 29 |
from transformers.modeling_attn_mask_utils import (
|
| 30 |
AttentionMaskConverter,
|
| 31 |
_prepare_4d_attention_mask,
|
|
|
|
| 234 |
return compressed_k, cu_seqlens_compressed
|
| 235 |
|
| 236 |
|
| 237 |
+
class CacheLayerMixin(ABC):
    """Base, abstract class for a single layer's cache."""

    # Subclasses that support torch.compile flip this to True.
    is_compileable = False

    def __init__(self):
        # Key/value tensors are allocated lazily by lazy_initialization().
        self.keys: torch.Tensor | None = None
        self.values: torch.Tensor | None = None
        self.is_initialized = False

    def __repr__(self):
        return f"{type(self).__name__}"

    @abstractmethod
    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None: ...

    @abstractmethod
    def update(
        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
    ) -> tuple[torch.Tensor, torch.Tensor]: ...

    @abstractmethod
    def get_mask_sizes(self, query_length: int) -> tuple[int, int]: ...

    @abstractmethod
    def get_seq_length(self) -> int: ...

    @abstractmethod
    def get_max_cache_shape(self) -> int: ...

    def offload(self):
        """Offload this layer's data to CPU device."""
        if not self.is_initialized:
            return
        self.keys = self.keys.to("cpu", non_blocking=True)
        self.values = self.values.to("cpu", non_blocking=True)

    def prefetch(self):
        """In case of layer offloading, this allows to move the data back to the layer's device ahead of time."""
        # assumes self.device was set by the subclass during lazy_initialization — TODO confirm
        if not (self.is_initialized and self.keys.device != self.device):
            return
        self.keys = self.keys.to(self.device, non_blocking=True)
        self.values = self.values.to(self.device, non_blocking=True)

    def reset(self) -> None:
        """Resets the cache values while preserving the objects"""
        if self.is_initialized:
            self.keys.zero_()
            self.values.zero_()
        # This attribute is set on several Layers
        if hasattr(self, "cumulative_length"):
            # It can either be an int for dynamic layers, or a tensor for static layers
            counter = self.cumulative_length
            if isinstance(counter, int):
                self.cumulative_length = 0
            else:
                counter.zero_()

    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
        """Reorders this layer's cache for beam search."""
        if self.get_seq_length() == 0:
            return
        self.keys = self.keys.index_select(0, beam_idx.to(self.keys.device))
        self.values = self.values.index_select(0, beam_idx.to(self.values.device))
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class DynamicLayer(CacheLayerMixin):
    """
    A cache layer that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the key and value states as tensors of shape `[batch_size, num_heads, seq_len, head_dim]`.
    """

    is_sliding = False

    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
        """Adopt dtype/device from the first incoming states and allocate empty, growable buffers."""
        self.dtype = key_states.dtype
        self.device = key_states.device
        self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
        self.values = torch.tensor([], dtype=self.dtype, device=self.device)
        self.is_initialized = True

    def update(
        self, key_states: torch.Tensor, value_states: torch.Tensor, *args, **kwargs
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Update the key and value caches in-place, and return the necessary keys and value states.

        Args:
            key_states (`torch.Tensor`): The new key states to cache.
            value_states (`torch.Tensor`): The new value states to cache.

        Returns:
            tuple[`torch.Tensor`, `torch.Tensor`]: The key and value states.
        """
        if not self.is_initialized:
            self.lazy_initialization(key_states, value_states)

        # Append the new states along the sequence axis.
        self.keys = torch.cat((self.keys, key_states), dim=-2)
        self.values = torch.cat((self.values, value_states), dim=-2)
        return self.keys, self.values

    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
        """Return the length and offset of the cache, used to generate the mask"""
        return self.get_seq_length() + query_length, 0

    def get_seq_length(self) -> int:
        """Returns the sequence length of the cached states."""
        if self.is_initialized and self.keys.numel() != 0:
            return self.keys.shape[-2]
        return 0

    def get_max_cache_shape(self) -> int:
        """Returns the maximum sequence length of the cache object. DynamicLayer does not have a maximum length."""
        return -1

    def crop(self, max_length: int) -> None:
        """
        Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be negative
        to remove `max_length` tokens.
        """
        current_length = self.get_seq_length()
        if max_length < 0:
            # A negative value means "drop that many tokens from the end".
            max_length = current_length + max_length
        if current_length <= max_length:
            return
        self.keys = self.keys[..., :max_length, :]
        self.values = self.values[..., :max_length, :]

    def batch_repeat_interleave(self, repeats: int) -> None:
        """Repeat the cache `repeats` times in the batch dimension."""
        if self.get_seq_length() == 0:
            return
        self.keys = self.keys.repeat_interleave(repeats, dim=0)
        self.values = self.values.repeat_interleave(repeats, dim=0)

    def batch_select_indices(self, indices: torch.Tensor) -> None:
        """Only keep the `indices` in the batch dimension of the cache."""
        if self.get_seq_length() == 0:
            return
        self.keys = self.keys[indices, ...]
        self.values = self.values[indices, ...]
|
| 375 |
+
|
| 376 |
|
| 377 |
class InfLLMv2CacheLayer(DynamicLayer):
|
| 378 |
def __init__(self):
|
|
|
|
| 1954 |
past_key_values_length = 0
|
| 1955 |
|
| 1956 |
if use_cache:
|
| 1957 |
+
# Reject old tuple-style cache, but allow None (first forward pass)
|
| 1958 |
+
if past_key_values is not None and not isinstance(past_key_values, Cache):
|
| 1959 |
raise ValueError(
|
| 1960 |
'You must use the new past_key_values format, such as the Cache class, instead of the old tuple format.'
|
| 1961 |
)
|
| 1962 |
+
|
| 1963 |
+
# Initialize cache if None (first forward pass)
|
| 1964 |
+
if past_key_values is None:
|
| 1965 |
+
if self.config.sparse_config is not None and torch.cuda.is_available():
|
| 1966 |
+
past_key_values = InfLLMv2Cache(config=self.config, num_hidden_layers=self.config.num_hidden_layers)
|
| 1967 |
+
else:
|
| 1968 |
+
past_key_values = DynamicCache()
|
| 1969 |
+
|
| 1970 |
# Calculate the usable length of past key values
|
| 1971 |
+
past_key_values_length = past_key_values.get_seq_length()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1972 |
|
| 1973 |
if position_ids is None:
|
| 1974 |
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
|
|
|
| 2050 |
|
| 2051 |
next_cache = None
|
| 2052 |
if use_cache:
|
| 2053 |
+
next_cache = next_decoder_cache
|
| 2054 |
if not return_dict:
|
| 2055 |
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
|
| 2056 |
return BaseModelOutputWithPast(
|