Instructions to use deepseek-ai/DeepSeek-R1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use deepseek-ai/DeepSeek-R1 with Transformers:

# High-level helper: build a text-generation pipeline and send it one chat turn
from transformers import pipeline

# Single-turn conversation in chat-message format.
messages = [{"role": "user", "content": "Who are you?"}]

pipe = pipeline(
    "text-generation",
    model="deepseek-ai/DeepSeek-R1",
    trust_remote_code=True,
)
pipe(messages)

# Load the tokenizer and model directly, without the pipeline helper
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1",
    trust_remote_code=True,
)

# Single-turn conversation in chat-message format.
messages = [{"role": "user", "content": "Who are you?"}]

# Render the chat template into tensors, then move them to the model's device.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = encoded.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Decode only the positions that come after the prompt's input_ids.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Inference
HuggingChat
Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use deepseek-ai/DeepSeek-R1 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "deepseek-ai/DeepSeek-R1"
# Call the server using curl (OpenAI-compatible API):
# POSTs a JSON chat request containing a single "user" message to
# /v1/chat/completions on localhost:8000.
# NOTE(review): presumably issued from a separate shell while the server
# started above is still running — confirm.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "deepseek-ai/DeepSeek-R1",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model from hf.co with the `docker model run` command:
docker model run hf.co/deepseek-ai/DeepSeek-R1

SGLang

How to use deepseek-ai/DeepSeek-R1 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# Binds to all interfaces (0.0.0.0) on port 30000.
python3 -m sglang.launch_server \
    --model-path "deepseek-ai/DeepSeek-R1" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# POSTs a JSON chat request containing a single "user" message to
# /v1/chat/completions on localhost:30000 (the --port chosen above).
# NOTE(review): presumably issued from a separate shell while the server
# started above is still running — confirm.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "deepseek-ai/DeepSeek-R1",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Launch the SGLang server inside the lmsysorg/sglang:latest image.
# The flags request all GPUs, set shared memory to 32g, map port 30000,
# mount the local ~/.cache/huggingface directory at /root/.cache/huggingface
# in the container, pass HF_TOKEN as an environment variable, and use the
# host IPC namespace.
# NOTE(review): "<secret>" looks like a placeholder for a real Hugging Face
# token — confirm and substitute before running.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "deepseek-ai/DeepSeek-R1" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# POSTs a JSON chat request containing a single "user" message to
# /v1/chat/completions on localhost:30000 (the port mapped above).
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "deepseek-ai/DeepSeek-R1",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use deepseek-ai/DeepSeek-R1 with Docker Model Runner:
```
# Run the model from hf.co with Docker Model Runner:
docker model run hf.co/deepseek-ai/DeepSeek-R1
```

Update config.json

#160

by jana0010 - opened Feb 19, 2025

base: refs/heads/main

←

from: refs/pr/160

Discussion Files changed

+52

-70

Files changed (1) hide show

config.json +52 -70

config.json CHANGED Viewed

@@ -1,70 +1,52 @@
-{
-  "architectures": [
-    "DeepseekV3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
-    "AutoModel": "modeling_deepseek.DeepseekV3Model",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
-  },
-  "aux_loss_alpha": 0.001,
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "ep_size": 1,
-  "first_k_dense_replace": 3,
-  "hidden_act": "silu",
-  "hidden_size": 7168,
-  "initializer_range": 0.02,
-  "intermediate_size": 18432,
-  "kv_lora_rank": 512,
-  "max_position_embeddings": 163840,
-  "model_type": "deepseek_v3",
-  "moe_intermediate_size": 2048,
-  "moe_layer_freq": 1,
-  "n_group": 8,
-  "n_routed_experts": 256,
-  "n_shared_experts": 1,
-  "norm_topk_prob": true,
-  "num_attention_heads": 128,
-  "num_experts_per_tok": 8,
-  "num_hidden_layers": 61,
-  "num_key_value_heads": 128,
-  "num_nextn_predict_layers": 1,
-  "pretraining_tp": 1,
-  "q_lora_rank": 1536,
-  "qk_nope_head_dim": 128,
-  "qk_rope_head_dim": 64,
-  "quantization_config": {
-    "activation_scheme": "dynamic",
-    "fmt": "e4m3",
-    "quant_method": "fp8",
-    "weight_block_size": [
-      128,
-      128
-    ]
-  },
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": {
-    "beta_fast": 32,
-    "beta_slow": 1,
-    "factor": 40,
-    "mscale": 1.0,
-    "mscale_all_dim": 1.0,
-    "original_max_position_embeddings": 4096,
-    "type": "yarn"
-  },
-  "rope_theta": 10000,
-  "routed_scaling_factor": 2.5,
-  "scoring_func": "sigmoid",
-  "seq_aux": true,
-  "tie_word_embeddings": false,
-  "topk_group": 4,
-  "topk_method": "noaux_tc",
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.46.3",
-  "use_cache": true,
-  "v_head_dim": 128,
-  "vocab_size": 129280
-}

+def load_model_with_quantization_fallback(
+    model_name: str = "deepseek-ai/DeepSeek-R1",
+    trust_remote_code: bool = True,
+    device_map: Optional[Union[str, Dict[str, Any]]] = "auto",
+    **kwargs
+) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+  try:
+      model = AutoModel.from_pretrained(
+          model_name,
+          trust_remote_code=trust_remote_code,
+          device_map=device_map,
+          **kwargs
+      )
+      tokenizer = AutoTokenizer.from_pretrained(model_name)
+      logger.info("Model loaded successfully with original configuration")
+      return model, tokenizer
+  except ValueError as e:
+      if "Unknown quantization type" in str(e):
+          logger.warning(
+              "Quantization type not supported directly. "
+              "Attempting to load without quantization..."
+          )
+          config = AutoConfig.from_pretrained(
+              model_name,
+              trust_remote_code=trust_remote_code
+          )
+          if hasattr(config, "quantization_config"):
+              delattr(config, "quantization_config")
+          try:
+              model = AutoModel.from_pretrained(
+                  model_name,
+                  config=config,
+                  trust_remote_code=trust_remote_code,
+                  device_map=device_map,
+                  **kwargs
+              )
+              tokenizer = AutoTokenizer.from_pretrained(
+                  model_name,
+                  trust_remote_code=trust_remote_code
+              )
+              logger.info("Model loaded successfully without quantization")
+              return model, tokenizer
+          except Exception as inner_e:
+              logger.error(f"Failed to load model without quantization: {str(inner_e)}")
+              raise
+      else:
+          logger.error(f"Unexpected error during model loading: {str(e)}")
+          raise