Commit 4ee9d9e · Parent: 22ba83b
Changed to autotune triton for 48G GPU deployment

Files changed:
- compressed_attention.py (+40, -15)
- topk_sparse_attention.py (+38, -15)
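
This commit replaces hard-coded `num_warps`/`num_stages` launch arguments with `@triton.autotune` decorators. Triton benchmarks every `triton.Config` candidate the first time a kernel runs for a given combination of the `key` fields, caches the fastest one, and reuses it for later launches, so the kernels tune themselves to the deployment GPU instead of relying on values tuned by hand for one card. A minimal, self-contained sketch of the pattern (the `scale_kernel` toy below is an illustrative assumption, not code from this repository):

import torch
import triton
import triton.language as tl

# Tune num_warps only: each Config leaves the meta-parameters ({}) empty and
# varies launch geometry. The tuner times all four configs on the first call
# for a given BLOCK_SIZE value, then caches the winner.
@triton.autotune(
    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
    key=['BLOCK_SIZE'],
)
@triton.jit
def scale_kernel(x_ptr, y_ptr, n, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    tl.store(y_ptr + offs, x * 2.0, mask=mask)

x = torch.randn(1 << 16, device='cuda')
y = torch.empty_like(x)
# num_warps is not passed at launch: the autotuner owns it now, which is why
# the call sites in the diffs below carry it only as a comment.
scale_kernel[(triton.cdiv(x.numel(), 1024),)](x, y, x.numel(), BLOCK_SIZE=1024)
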
compressed_attention.py CHANGED

@@ -26,7 +26,13 @@ except ImportError:
 
 IS_HOPPER_GPU = is_hopper_gpu()
 
-
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8]
+    ],
+    key=['HEAD_DIM', 'BLOCK_SIZE_Q', 'BLOCK_SIZE_K', 'BLOCK_SIZE_V'],
+)
 @triton.jit
 def forward_kernel(
     q_ptr, # Q: n x h x d
@@ -159,6 +165,13 @@ def forward_kernel(
     tl.store(l_ptrs, lse_i, mask=off_q < q_len)
 
 
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8]
+    ],
+    key=['HEAD_DIM', 'BLOCK_SIZE_O', 'BLOCK_SIZE_D'],
+)
 @triton.jit
 def backward_sum_o_do(
     o_ptr, # O: n x h x d
@@ -194,7 +207,13 @@ def backward_sum_o_do(
     delta = tl.sum(o * do, axis=1)
     tl.store(delta_ptr + pid_h * stride_dh + off_n * stride_dn, delta, mask=off_n < o_len)
 
-
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8]
+    ],
+    key=['HEAD_DIM', 'BLOCK_SIZE_Q', 'BLOCK_SIZE_K', 'BLOCK_SIZE_D'],
+)
 @triton.jit
 def backward_dkdv(
     q_ptr, # Q: n x qh x d
@@ -368,7 +387,13 @@ def backward_dkdv(
     tl.store(dk_ptrs, dk.to(dk_ptr.dtype.element_ty), boundary_check=(0, 1))
     tl.store(dv_ptrs, dv.to(dv_ptr.dtype.element_ty), boundary_check=(0, 1))
 
-
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps)
+        for num_warps in [1, 2, 4, 8]
+    ],
+    key=['HEAD_DIM', 'BLOCK_SIZE_Q', 'BLOCK_SIZE_K', 'BLOCK_SIZE_D'],
+)
 @triton.jit
 def backward_dq(
     q_ptr, # Q: n x qh x d
@@ -595,8 +620,8 @@ def _compressed_attention_fwd(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     return o, lse
 
@@ -643,8 +668,8 @@ def _compressed_attention_bwd(
         delta.stride(1),
         BLOCK_SIZE_O=BLOCK_SIZE_O,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     # compute dk dv
     dk = torch.zeros(num_share_q_heads, k_len, num_k_heads, head_dim, device=k.device, dtype=k.dtype)
@@ -703,8 +728,8 @@ def _compressed_attention_bwd(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     dk = dk.sum(0)
     dv = dv.sum(0)
@@ -756,8 +781,8 @@ def _compressed_attention_bwd(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     return dq, dk, dv
 
@@ -1000,8 +1025,8 @@ def _get_attention_score(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=8,
-        num_stages=3,
+        # num_warps=8,
+        # num_stages=3,
    )
     return score
 
@@ -1155,8 +1180,8 @@ def transform_score(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_O=BLOCK_SIZE_O,
-        num_warps=4,
-        num_stages=3,
+        # num_warps=4,
+        # num_stages=3,
     )
     return block_score
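
Note on the call-site half of this diff: once a kernel is wrapped in `@triton.autotune`, the tuner supplies `num_warps` at launch, so passing it explicitly again would conflict with the chosen config; the commit therefore keeps the old hand-tuned values only as comments. The same pattern repeats in topk_sparse_attention.py below. Since `num_stages` is no longer passed either, a natural extension would be to let the tuner search over pipeline depth as well, e.g. (an illustrative variant, not code from this commit):

configs = [
    triton.Config({}, num_warps=nw, num_stages=ns)
    for nw in [4, 8]
    for ns in [2, 3, 4]
]
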
topk_sparse_attention.py CHANGED

@@ -25,7 +25,10 @@ except ImportError:
 
 IS_HOPPER_GPU = is_hopper_gpu()
 
-
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['HEAD_DIM', 'BLOCK_SIZE_K', 'BLOCK_SIZE_D', 'BLOCK_SIZE_H', 'BLOCK_SIZE_T'],
+)
 @triton.jit
 def forward_kernel_orig(
     q_ptr, # Q: n x h x d
@@ -196,7 +199,10 @@ def forward_kernel_orig(
     lse_ptrs = lse_ptr + (q_start + pid_q_j) * stride_ln + (pid_h + off_h) * stride_lh
     tl.store(lse_ptrs, lse_i, mask=off_h < NUM_SHARE_Q_HEADS)
 
-
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['HEAD_DIM', 'BLOCK_SIZE_O', 'BLOCK_SIZE_D'],
+)
 @triton.jit
 def backward_sum_o_do(
     o_ptr, # O: n x h x d
@@ -233,6 +239,10 @@ def backward_sum_o_do(
     tl.store(delta_ptr + pid_h * stride_dh + off_o * stride_dn, delta, mask=off_o < o_len)
 
 
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['BLOCK_SIZE_N', 'BLOCK_SIZE_K', 'BLOCK_SIZE_R'],
+)
 @triton.jit
 def count_kernel(
     x_ptr, # [num_kv_heads, total_len, topk]
@@ -309,12 +319,16 @@ def count_query(
         BLOCK_SIZE_N=BLOCK_SIZE_N,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_R=BLOCK_SIZE_R,
-        num_warps=4,
-        num_stages=3,
+        # num_warps=4,
+        # num_stages=3,
     )
     return active_query_count
 
 
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['topk', 'BLOCK_SIZE_N', 'BLOCK_SIZE_T'],
+)
 @triton.jit
 def pad_topk_idx_kernel(
     t_ptr,
@@ -360,7 +374,10 @@ def pad_topk_idx_kernel(
     idxs = tl.load(t_ptrs, boundary_check=(0, 1))
     tl.store(p_ptrs, idxs, boundary_check=(0, 1))
 
-
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['BLOCK_SIZE_N'],
+)
 @triton.jit
 def save_topk_idx_kernel(
     p_ptr,
@@ -478,7 +495,10 @@ def reorder_topk_idx(
     )
     return topk_q_idx
 
-
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['HEAD_DIM', 'BLOCK_SIZE_Q', 'BLOCK_SIZE_K', 'BLOCK_SIZE_D'],
+)
 @triton.jit
 def backward_dkdv(
     q_ptr, # Q: n x qh x d
@@ -646,7 +666,10 @@ def backward_dkdv(
     tl.store(dk_ptrs, dk.to(dk_ptr.dtype.element_ty), boundary_check=(0, 1))
     tl.store(dv_ptrs, dv.to(dv_ptr.dtype.element_ty), boundary_check=(0, 1))
 
-
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=nw) for nw in [1, 2, 4, 8]],
+    key=['HEAD_DIM', 'BLOCK_SIZE_K', 'BLOCK_SIZE_D', 'BLOCK_SIZE_H', 'BLOCK_SIZE_T'],
+)
 @triton.jit
 def backward_dq(
     q_ptr, # Q: n x qh x d
@@ -902,8 +925,8 @@ def _topk_sparse_attention_fwd(
         BLOCK_SIZE_D=BLOCK_SIZE_D,
         BLOCK_SIZE_H=BLOCK_SIZE_H,
         BLOCK_SIZE_T=BLOCK_SIZE_T,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     return o, lse
 
@@ -954,8 +977,8 @@ def _topk_sparse_attention_bwd(
         delta.stride(1),
         BLOCK_SIZE_O=BLOCK_SIZE_O,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     # count active querys for each key block, shape: (num_k_heads, total_k_blocks)
     seqlens = cu_seqlens_q[1:] - cu_seqlens_q[:-1]
@@ -1038,8 +1061,8 @@ def _topk_sparse_attention_bwd(
         BLOCK_SIZE_Q=BLOCK_SIZE_Q,
         BLOCK_SIZE_K=BLOCK_SIZE_K,
         BLOCK_SIZE_D=BLOCK_SIZE_D,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     dk = dk.sum(0)
     dv = dv.sum(0)
@@ -1096,8 +1119,8 @@ def _topk_sparse_attention_bwd(
         BLOCK_SIZE_D=BLOCK_SIZE_D,
         BLOCK_SIZE_H=BLOCK_SIZE_H,
         BLOCK_SIZE_T=BLOCK_SIZE_T,
-        num_warps=num_warps,
-        num_stages=num_stages,
+        # num_warps=num_warps,
+        # num_stages=num_stages,
     )
     return dq, dk, dv
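
A deployment consideration that follows from this change: autotuning is lazy, so the first launch for each new `key` combination benchmarks every candidate config before doing real work, which shows up as a one-off latency spike. Warming the kernels up with representative inputs at startup moves that cost out of the request path. A sketch reusing the toy `scale_kernel` from the first example (the shapes and the `best_config` inspection are assumptions about typical Triton usage, not part of this commit):

# Launch each representative size once so tuning runs at startup rather
# than on the first production request.
for n in (4096, 1 << 16, 1 << 20):
    x = torch.randn(n, device='cuda')
    y = torch.empty_like(x)
    scale_kernel[(triton.cdiv(n, 1024),)](x, y, n, BLOCK_SIZE=1024)
torch.cuda.synchronize()

# Recent Triton versions expose the winning config for inspection:
print(scale_kernel.best_config)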