Fix _init_rope compatibility with transformers >= 5.x rope_scaling standardization
#10
by DennisHuang648 - opened
- modeling_minicpm_sala.py +29 -13
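For context on the review: the patch replaces the old `rope_scaling is None` check with type detection that accepts both the legacy `"type"` key and the standardized `"rope_type"` key. The helper below is an editor's sketch that mirrors that detection in isolation; the function name and example dicts are illustrative, not part of the diff.

```python
# Sketch of the detection logic this PR adds (names and example dicts are
# illustrative, not copied from the diff or any real checkpoint config).
def uses_default_rope(rope_scaling):
    """Return True when rope_scaling means "no scaling" under either convention."""
    scaling_type = None
    if isinstance(rope_scaling, dict):
        # Legacy configs use "type"; standardized transformers configs use "rope_type".
        scaling_type = rope_scaling.get("type") or rope_scaling.get("rope_type")
    return rope_scaling is None or scaling_type in (None, "default")

print(uses_default_rope(None))                                     # True: legacy "no scaling"
print(uses_default_rope({"rope_type": "default", "factor": 1.0}))  # True: standardized fill-in
print(uses_default_rope({"type": "longrope", "factor": 1.0}))      # False: real scaling config
```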
modeling_minicpm_sala.py
CHANGED
@@ -877,15 +877,23 @@ class MiniCPMAttention(nn.Module):
         )

     def _init_rope(self):
-        if self.config.rope_scaling is None:
+        # transformers>=4.43 standardizes rope_scaling: a missing/None
+        # rope_scaling is auto-filled to {"rope_type": "default", "factor": 1.0}
+        # at config-load time. Treat both the original None case and the
+        # standardized "default" as no scaling so loading does not raise on
+        # newer transformers releases.
+        rope_scaling = self.config.rope_scaling
+        scaling_type = None
+        if isinstance(rope_scaling, dict):
+            scaling_type = rope_scaling.get("type") or rope_scaling.get("rope_type")
+        if rope_scaling is None or scaling_type in (None, "default"):
             self.rotary_emb = MiniCPMRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
             )
         else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling.get("factor", None)
+            scaling_factor = rope_scaling.get("factor", None)
             if scaling_type == "linear":
                 self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
                     self.head_dim,
@@ -904,10 +912,10 @@ class MiniCPMAttention(nn.Module):
             self.rotary_emb = MiniCPMLongRoPE(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
-                short_factor=self.config.rope_scaling["short_factor"],
-                long_factor=self.config.rope_scaling["long_factor"],
+                short_factor=rope_scaling["short_factor"],
+                long_factor=rope_scaling["long_factor"],
                 base=self.rope_theta,
-                original_max_position_embeddings=self.config.rope_scaling[
+                original_max_position_embeddings=rope_scaling[
                     "original_max_position_embeddings"
                 ],
             )
@@ -2142,15 +2150,23 @@ class LightningAttention(nn.Module):
         self._init_rope()

     def _init_rope(self):
-        if self.config.rope_scaling is None:
+        # transformers>=4.43 standardizes rope_scaling: a missing/None
+        # rope_scaling is auto-filled to {"rope_type": "default", "factor": 1.0}
+        # at config-load time. Treat both the original None case and the
+        # standardized "default" as no scaling so loading does not raise on
+        # newer transformers releases.
+        rope_scaling = self.config.rope_scaling
+        scaling_type = None
+        if isinstance(rope_scaling, dict):
+            scaling_type = rope_scaling.get("type") or rope_scaling.get("rope_type")
+        if rope_scaling is None or scaling_type in (None, "default"):
             self.rotary_emb = MiniCPMRotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.config.max_position_embeddings,
                 base=self.config.rope_theta,
             )
         else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling.get("factor", None)
+            scaling_factor = rope_scaling.get("factor", None)
             if scaling_type == "linear":
                 self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding(
                     self.head_dim,
@@ -2169,10 +2185,10 @@ class LightningAttention(nn.Module):
             self.rotary_emb = MiniCPMLongRoPE(
                 self.head_dim,
                 max_position_embeddings=self.config.max_position_embeddings,
-                short_factor=self.config.rope_scaling["short_factor"],
-                long_factor=self.config.rope_scaling["long_factor"],
+                short_factor=rope_scaling["short_factor"],
+                long_factor=rope_scaling["long_factor"],
                 base=self.config.rope_theta,
-                original_max_position_embeddings=self.config.rope_scaling[
+                original_max_position_embeddings=rope_scaling[
                     "original_max_position_embeddings"
                 ],
             )
@@ -3274,4 +3290,4 @@ class MiniCPMSALAForSequenceClassification(MiniCPMSALAPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )
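A quick way to exercise the fix is to reload the config and model on a recent transformers release. The snippet below is a hypothetical smoke test, with `<repo-id>` standing in for the actual model repository; the auto class choice is an assumption, not taken from this PR.

```python
# Hypothetical smoke test for this patch; "<repo-id>" is a placeholder and
# trust_remote_code pulls in the patched modeling_minicpm_sala.py.
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("<repo-id>", trust_remote_code=True)
print(config.rope_scaling)  # may be a standardized dict rather than None on newer releases

# Before the fix, _init_rope could fail here: the None check missed an
# auto-filled dict, and the old rope_scaling["type"] lookup raised KeyError
# when the standardized dict only carries "rope_type".
model = AutoModelForCausalLM.from_pretrained("<repo-id>", trust_remote_code=True)
```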