Update modeling_bailing_moe.py
modeling_bailing_moe.py  CHANGED  (+13 -8)

@@ -117,8 +117,8 @@ class BailingMoeRMSNorm(nn.Module):
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return (self.weight.float() * hidden_states).to(input_dtype)
+        return self.weight * hidden_states.to(input_dtype)


 ALL_LAYERNORM_LAYERS.append(BailingMoeRMSNorm)
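
Note on the RMSNorm change above: the removed line multiplies the normalized fp32 activations by an fp32 copy of the weight and casts the product back to the input dtype, while the new line casts the activations back first and multiplies by the weight in its stored dtype. A minimal sketch of the two expressions side by side, assuming bf16 inputs and weights as in typical checkpoints (rmsnorm_variants is illustrative, not part of the file):

import torch

def rmsnorm_variants(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    """Compare the removed and added return expressions of the RMSNorm forward."""
    input_dtype = hidden_states.dtype
    hs = hidden_states.to(torch.float32)
    variance = hs.pow(2).mean(-1, keepdim=True)
    hs = hs * torch.rsqrt(variance + eps)
    old = (weight.float() * hs).to(input_dtype)  # removed form: multiply in fp32, cast the product once
    new = weight * hs.to(input_dtype)            # added form: cast first, multiply in the weight's dtype
    return old, new

x = torch.randn(2, 4, 8, dtype=torch.bfloat16)
w = torch.ones(8, dtype=torch.bfloat16)
old, new = rmsnorm_variants(x, w)
print(old.dtype, new.dtype, (old - new).abs().max())  # both bf16; values differ only by rounding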
@@ -495,7 +495,7 @@ class BailingMoeAttention(nn.Module):
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)

-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        attn_weights = torch.matmul(query_states / math.sqrt(self.head_dim), key_states.transpose(2, 3))

         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
             raise ValueError(
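
The rewritten score computation is algebraically the same as scaling after the matmul, since (Q / sqrt(d)) K^T = (Q K^T) / sqrt(d); folding the scale into the query keeps the intermediate products smaller, which helps in fp16/bf16. A quick equivalence check with standalone tensors (shapes are made up for illustration):

import math
import torch

bsz, num_heads, q_len, kv_len, head_dim = 1, 2, 4, 4, 8
query_states = torch.randn(bsz, num_heads, q_len, head_dim)
key_states = torch.randn(bsz, num_heads, kv_len, head_dim)

# Scale applied to the scores after the matmul.
scaled_after = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim)
# Scale folded into the query before the matmul (the form used in the updated file).
scaled_before = torch.matmul(query_states / math.sqrt(head_dim), key_states.transpose(2, 3))

print(torch.allclose(scaled_after, scaled_before, atol=1e-6))  # True up to floating-point rounding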
@@ -825,7 +825,6 @@ class BailingMoeSdpaAttention(BailingMoeAttention):
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
-            # enable_gqa=True
         )

         attn_output = attn_output.transpose(1, 2).contiguous()
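
The dropped comment referred to enable_gqa, a keyword of torch.nn.functional.scaled_dot_product_attention (available in PyTorch 2.5 and later) that lets SDPA broadcast grouped KV heads internally instead of relying on repeat_kv. A small sketch of what enabling it would look like, assuming a recent PyTorch; the file itself keeps the explicit repeat_kv path:

import torch
import torch.nn.functional as F

# Hypothetical GQA shapes: 8 query heads sharing 2 KV heads; requires PyTorch >= 2.5 for enable_gqa.
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 2, 16, 64)
v = torch.randn(1, 2, 16, 64)

# With enable_gqa=True, SDPA broadcasts the 2 KV heads across the 8 query heads itself,
# so the explicit repeat_kv expansion used elsewhere in the file would not be needed.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)
print(out.shape)  # torch.Size([1, 8, 16, 64])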
@@ -847,6 +846,7 @@ class BailingMoeDecoderLayer(nn.Module):
     def __init__(self, config: BailingMoeConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
+
         self.attention = BAILING_MOE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)

         self.mlp = (
@@ -1167,7 +1167,7 @@ class BailingMoeModel(BailingMoePreTrainedModel):
         all_router_logits = () if output_router_logits else None
         next_decoder_cache = None

-        for
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)

@@ -1332,9 +1332,10 @@ class BailingMoeForCausalLM(BailingMoePreTrainedModel):
                 )
                 logits = F.linear(hidden_states, norm_weight, None)
             else:
-                self.lm_head.weight.data = (
+                self.lm_head.weight.data = (
+                    self.lm_head.weight.data.float()
+                    / (torch.norm(self.lm_head.weight.data.float(), p=2, dim=0, keepdim=True) + 1e-7)
+                ).to(hidden_states.dtype)
                 logits = F.linear(hidden_states, self.lm_head.weight.data, None)
                 self.norm_head = False
         else:
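
The rewritten else branch spells out the norm_head path: the (vocab_size, hidden_size) lm_head weight is divided in place by its L2 norm along dim=0 (plus a 1e-7 epsilon), cast back to the activation dtype, and self.norm_head is then set to False so the rescaling happens only once. A self-contained sketch of the same computation with hypothetical sizes:

import torch
import torch.nn.functional as F

vocab_size, hidden_size = 32, 8
lm_head = torch.nn.Linear(hidden_size, vocab_size, bias=False)
hidden_states = torch.randn(2, 3, hidden_size)

# Same normalization as the else branch: divide the (vocab_size, hidden_size) weight
# by its L2 norm along dim=0, with a small epsilon to avoid division by zero,
# then reuse the overwritten weight for the output projection.
weight = lm_head.weight.data
lm_head.weight.data = (
    weight.float() / (torch.norm(weight.float(), p=2, dim=0, keepdim=True) + 1e-7)
).to(hidden_states.dtype)
logits = F.linear(hidden_states, lm_head.weight.data, None)
print(logits.shape)  # torch.Size([2, 3, 32])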
@@ -1380,7 +1381,11 @@ class BailingMoeForCausalLM(BailingMoePreTrainedModel):
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
                 past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                max_cache_length = (
+                    past_key_values.get_max_length()
+                    if hasattr(past_key_values, "get_max_length")
+                    else past_key_values.get_max_cache_shape()
+                )
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
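
The new max_cache_length expression is a version-compatibility shim: older transformers releases expose Cache.get_max_length, while newer ones replace it with Cache.get_max_cache_shape, so the hasattr check works with both. The same logic as a standalone helper, assuming a transformers Cache instance such as DynamicCache:

from typing import Optional

from transformers.cache_utils import Cache, DynamicCache


def get_max_cache_length(past_key_values: Cache) -> Optional[int]:
    """Version-tolerant lookup: older transformers expose Cache.get_max_length,
    newer releases replace it with Cache.get_max_cache_shape (both return None
    for an unbounded DynamicCache)."""
    if hasattr(past_key_values, "get_max_length"):
        return past_key_values.get_max_length()
    return past_key_values.get_max_cache_shape()


print(get_max_cache_length(DynamicCache()))  # None for a dynamic (unbounded) cache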