Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/vllm/attention/layer.py +364 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/activation.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/layernorm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/linear.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/pooler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/__pycache__/mamba_mixer.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/mamba_mixer.py +243 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +104 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +413 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/fp8_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- .venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json +146 -0
.venv/lib/python3.11/site-packages/vllm/attention/layer.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
"""Attention layer."""
|
| 3 |
+
from typing import Any, Dict, List, Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
|
| 9 |
+
import vllm.envs as envs
|
| 10 |
+
from vllm.attention import AttentionMetadata, AttentionType
|
| 11 |
+
from vllm.attention.selector import backend_name_to_enum, get_attn_backend
|
| 12 |
+
from vllm.config import CacheConfig, get_current_vllm_config
|
| 13 |
+
from vllm.forward_context import ForwardContext, get_forward_context
|
| 14 |
+
from vllm.model_executor.layers.quantization.base_config import (
|
| 15 |
+
QuantizationConfig)
|
| 16 |
+
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
|
| 17 |
+
from vllm.platforms import _Backend, current_platform
|
| 18 |
+
from vllm.utils import direct_register_custom_op
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Attention(nn.Module):
    """Attention layer.

    Takes query, key, and value tensors (prompt or generation tokens) and:

    1. Stores the input key/value tensors in the KV cache.
    2. Runs (multi-head / multi-query / grouped-query) attention.
    3. Returns the attention output tensor.
    """

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
        alibi_slopes: Optional[List[float]] = None,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        per_layer_sliding_window: Optional[int] = None,
        use_mla: bool = False,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        **extra_impl_args,
    ) -> None:
        super().__init__()
        # A per-layer sliding window overrides the model-level setting.
        if per_layer_sliding_window is not None:
            sliding_window = per_layer_sliding_window
        elif cache_config is not None:
            sliding_window = cache_config.sliding_window
        else:
            sliding_window = None

        # Pull cache parameters from the config when present; otherwise fall
        # back to defaults usable without a cache config.
        if cache_config is not None:
            kv_cache_dtype = cache_config.cache_dtype
            block_size = cache_config.block_size
            is_attention_free = cache_config.is_attention_free
            calculate_kv_scales = cache_config.calculate_kv_scales
        else:
            kv_cache_dtype = "auto"
            block_size = 16
            is_attention_free = False
            calculate_kv_scales = False
        if num_kv_heads is None:
            num_kv_heads = num_heads

        # The default k/v_scale of 1.0 is ignored when the kv-cache is not
        # fp8 and is meant for fp8_e5m2.  For fp8_e4m3 the pre-quantized
        # k/v_scale is expected to be loaded along with the model weights.
        self.kv_cache_dtype = kv_cache_dtype
        self.calculate_kv_scales = calculate_kv_scales
        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
        self._v_scale = torch.tensor(1.0, dtype=torch.float32)

        # Plain-float copies of k/v_scale for attention backends that cannot
        # consume tensor-valued scales (e.g. Flashinfer).
        self._k_scale_float = 1.0
        self._v_scale_float = 1.0

        quant_method = quant_config.get_quant_method(
            self, prefix=prefix) if quant_config else None
        if quant_method is not None:
            assert isinstance(quant_method, BaseKVCacheMethod)
            # TODO (mgoin): kv cache dtype should be specified in the FP8
            # checkpoint config and become the "auto" behavior
            if self.kv_cache_dtype == "fp8_e5m2":
                raise ValueError("fp8_e5m2 kv-cache is not supported with "
                                 "fp8 checkpoints.")
            # With quantization enabled, "k_scale" and "v_scale" become
            # parameters loadable from the checkpoint; they are converted
            # back to native float32 after weight loading.
            self.quant_method = quant_method
            self.quant_method.create_weights(self)

        # During model initialization the default dtype matches the model
        # weight/activation dtype.
        dtype = torch.get_default_dtype()
        attn_backend = get_attn_backend(head_size,
                                        dtype,
                                        kv_cache_dtype,
                                        block_size,
                                        is_attention_free,
                                        blocksparse_params is not None,
                                        use_mla=use_mla)
        impl_cls = attn_backend.get_impl_cls()
        self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
                             alibi_slopes, sliding_window, kv_cache_dtype,
                             blocksparse_params, logits_soft_cap, attn_type,
                             **extra_impl_args)
        self.num_heads = num_heads
        self.head_size = head_size
        self.num_kv_heads = num_kv_heads
        self.sliding_window = sliding_window
        self.backend = backend_name_to_enum(attn_backend.get_name())
        self.dtype = dtype

        # On cuda-alike (CUDA and ROCm) and CPU platforms the attention call
        # is registered as one opaque custom op for torch.compile.  On other
        # platforms it is called directly and torch.compile handles it.
        self.use_direct_call = not current_platform.is_cuda_alike(
        ) and not current_platform.is_cpu()

        self.use_output = attn_backend.accept_output_buffer
        compilation_config = get_current_vllm_config().compilation_config
        if prefix in compilation_config.static_forward_context:
            raise ValueError(f"Duplicate layer name: {prefix}")
        compilation_config.static_forward_context[prefix] = self
        self.layer_name = prefix
        self.attn_type = attn_type
        # Placeholder kv-cache tensors used during init; bind_kv_cache
        # replaces them later.  Not accessed when use_direct_call is True.
        self.kv_cache = [
            torch.tensor([]) for _ in range(get_current_vllm_config(
            ).parallel_config.pipeline_parallel_size)
        ]

        self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32)
        self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        # NOTE: avoid reading the `kv_cache` / `attn_metadata` arguments
        # directly; use `self.kv_cache` and
        # `get_forward_context().attn_metadata` instead.
        if self.calculate_kv_scales:
            ctx_attn_metadata = get_forward_context().attn_metadata
            if ctx_attn_metadata.enable_kv_scales_calculation:
                self.calc_kv_scales(key, value)
        if self.use_output:
            output = torch.empty_like(query)
            hidden_size = query.size(-1)
            # Reshape q/k/v here, outside the custom op, to keep CPU
            # overhead out of the non-CUDA-graph regions.
            query = query.view(-1, self.num_heads, self.head_size)
            output = output.view(-1, self.num_heads, self.head_size)
            if key is not None:
                key = key.view(-1, self.num_kv_heads, self.head_size)
            if value is not None:
                value = value.view(-1, self.num_kv_heads, self.head_size)
            if self.use_direct_call:
                forward_context: ForwardContext = get_forward_context()
                ctx_attn_metadata = forward_context.attn_metadata
                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                self.impl.forward(self,
                                  query,
                                  key,
                                  value,
                                  self_kv_cache,
                                  ctx_attn_metadata,
                                  output=output)
            else:
                torch.ops.vllm.unified_attention_with_output(
                    query, key, value, output, self.layer_name)
            return output.view(-1, hidden_size)
        else:
            if self.use_direct_call:
                forward_context = get_forward_context()
                ctx_attn_metadata = forward_context.attn_metadata
                self_kv_cache = self.kv_cache[forward_context.virtual_engine]
                return self.impl.forward(self, query, key, value,
                                         self_kv_cache, ctx_attn_metadata)
            else:
                return torch.ops.vllm.unified_attention(
                    query, key, value, self.layer_name)

    def calc_kv_scales(self, key, value):
        # Derive scales from the observed dynamic range of k/v activations.
        self._k_scale.copy_(torch.abs(key).max() / self.k_range)
        self._v_scale.copy_(torch.abs(value).max() / self.v_range)
        self._k_scale_float = self._k_scale.item()
        self._v_scale_float = self._v_scale.item()
        # The scales are computed only once.
        self.calculate_kv_scales = False

    def extra_repr(self) -> str:
        s = f"head_size={self.impl.head_size}"  # type: ignore
        s += f", num_heads={self.impl.num_heads}"  # type: ignore
        s += f", num_kv_heads={self.impl.num_kv_heads}"  # type: ignore
        s += f", scale={self.impl.scale}"  # type: ignore
        s += f", backend={self.impl.__class__.__name__}"
        return s

    def process_weights_after_loading(self, act_dtype: torch.dtype):
        # Delegate to the backend implementation when it supports it.
        if hasattr(self.impl, "process_weights_after_loading"):
            self.impl.process_weights_after_loading(act_dtype)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class MultiHeadAttention(nn.Module):
    """Multi-headed attention without any cache, used for ViT."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: Optional[int] = None,
    ):
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = scale
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        dtype = torch.get_default_dtype()
        attn_backend = get_attn_backend(head_size,
                                        dtype,
                                        kv_cache_dtype=None,
                                        block_size=16,
                                        is_attention_free=False)
        backend = backend_name_to_enum(attn_backend.get_name())
        # Flash-attention backends are not usable here; fall back to
        # xformers instead.
        if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
            backend = _Backend.XFORMERS

        # Anything other than SDPA / xformers degrades to torch SDPA.
        self.attn_backend = backend if backend in {
            _Backend.TORCH_SDPA,
            _Backend.XFORMERS,
        } else _Backend.TORCH_SDPA

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
    ) -> torch.Tensor:
        """Input shape: batch_size x seq_len x hidden_size"""
        # TODO(Isotr0py): Use existing backend implementations and support FA3
        bsz, q_len, _ = query.size()
        kv_len = key.size(1)

        query = query.view(bsz, q_len, self.num_heads, self.head_size)
        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size)
        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size)

        n_rep = self.num_queries_per_kv
        if n_rep > 1:
            # Expand k/v heads to match q heads for MQA / GQA.
            key = torch.repeat_interleave(key, n_rep, dim=2)
            value = torch.repeat_interleave(value, n_rep, dim=2)

        if self.attn_backend == _Backend.XFORMERS:
            from xformers import ops as xops

            out = xops.memory_efficient_attention_forward(query,
                                                          key,
                                                          value,
                                                          scale=self.scale)
        elif self.attn_backend == _Backend.TORCH_SDPA:
            # SDPA expects (batch, heads, seq, head_size).
            query, key, value = (x.transpose(1, 2)
                                 for x in (query, key, value))
            out = F.scaled_dot_product_attention(query,
                                                 key,
                                                 value,
                                                 scale=self.scale)
            out = out.transpose(1, 2)
        return out.reshape(bsz, q_len, -1)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def unified_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    """Custom-op entry point: run attention for the named layer and return
    its output, resolving the layer and kv-cache from the forward context."""
    forward_context: ForwardContext = get_forward_context()
    attn_layer = forward_context.attn_layers[layer_name]
    kv_cache = attn_layer.kv_cache[forward_context.virtual_engine]
    return attn_layer.impl.forward(attn_layer, query, key, value, kv_cache,
                                   forward_context.attn_metadata)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def unified_attention_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    layer_name: str,
) -> torch.Tensor:
    """Fake (meta) implementation for tracing: an uninitialized contiguous
    tensor with the same shape/dtype as `query`."""
    placeholder = torch.empty_like(query)
    return placeholder.contiguous()
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
# Register `unified_attention` as an opaque custom op so torch.compile
# treats the whole attention call as a single node.
direct_register_custom_op(
    op_name="unified_attention",
    op_func=unified_attention,
    mutates_args=[],
    fake_impl=unified_attention_fake,
    dispatch_key=current_platform.dispatch_key,
)
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def unified_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
) -> None:
    """Custom-op entry point: run attention for the named layer, writing the
    result into the caller-provided `output` buffer (returns nothing)."""
    forward_context: ForwardContext = get_forward_context()
    attn_layer = forward_context.attn_layers[layer_name]
    kv_cache = attn_layer.kv_cache[forward_context.virtual_engine]
    attn_layer.impl.forward(attn_layer,
                            query,
                            key,
                            value,
                            kv_cache,
                            forward_context.attn_metadata,
                            output=output)
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def unified_attention_with_output_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
) -> None:
    """Fake (meta) implementation for tracing: the real op only mutates
    `output`, so there is nothing to return here."""
    return None
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# Register the output-buffer variant; `output` is declared as mutated so the
# compiler knows the op writes into it in place.
direct_register_custom_op(
    op_name="unified_attention_with_output",
    op_func=unified_attention_with_output,
    mutates_args=["output"],
    fake_impl=unified_attention_with_output_fake,
    dispatch_key=current_platform.dispatch_key,
)
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/activation.cpython-311.pyc
ADDED
|
Binary file (22.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/layernorm.cpython-311.pyc
ADDED
|
Binary file (9.97 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/linear.cpython-311.pyc
ADDED
|
Binary file (50.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/pooler.cpython-311.pyc
ADDED
|
Binary file (16.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-311.pyc
ADDED
|
Binary file (24.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (205 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/__pycache__/mamba_mixer.cpython-311.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/mamba_mixer.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
from torch.nn.parameter import Parameter
|
| 6 |
+
|
| 7 |
+
from vllm.attention.backends.abstract import AttentionMetadata
|
| 8 |
+
from vllm.distributed.parallel_state import (
|
| 9 |
+
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
|
| 10 |
+
from vllm.model_executor.custom_op import CustomOp
|
| 11 |
+
from vllm.model_executor.layers.layernorm import RMSNorm
|
| 12 |
+
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
| 13 |
+
MergedColumnParallelLinear,
|
| 14 |
+
RowParallelLinear)
|
| 15 |
+
from vllm.model_executor.layers.mamba.ops.causal_conv1d import (
|
| 16 |
+
causal_conv1d_fn, causal_conv1d_update)
|
| 17 |
+
from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
|
| 18 |
+
selective_scan_fn, selective_state_update)
|
| 19 |
+
from vllm.model_executor.models.mamba_cache import MambaCacheParams
|
| 20 |
+
from vllm.model_executor.utils import set_weight_attrs
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
|
| 24 |
+
@CustomOp.register("mamba_mixer")
|
| 25 |
+
class MambaMixer(CustomOp):
|
| 26 |
+
"""
|
| 27 |
+
Compute ∆, A, B, C, and D the state space parameters and compute
|
| 28 |
+
the `contextualized_states`. A, D are input independent
|
| 29 |
+
(see Mamba paper [1] Section 3.5.2 "Interpretation of A"
|
| 30 |
+
for why A isn't selective) ∆, B, C are input-dependent
|
| 31 |
+
(this is a key difference between Mamba and the linear time
|
| 32 |
+
invariant S4, and is why Mamba is called
|
| 33 |
+
**selective** state spaces)
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
def __init__(self,
|
| 37 |
+
hidden_size: int,
|
| 38 |
+
ssm_state_size: int,
|
| 39 |
+
conv_kernel_size: int,
|
| 40 |
+
intermediate_size: int,
|
| 41 |
+
time_step_rank: int,
|
| 42 |
+
use_conv_bias: bool,
|
| 43 |
+
use_bias: bool,
|
| 44 |
+
use_rms_norm: bool,
|
| 45 |
+
rms_norm_has_weight: bool = True,
|
| 46 |
+
rms_norm_eps: float = 1e-5,
|
| 47 |
+
activation="silu",
|
| 48 |
+
is_lora_enabled: bool = False):
|
| 49 |
+
super().__init__()
|
| 50 |
+
self.time_step_rank = time_step_rank
|
| 51 |
+
self.ssm_state_size = ssm_state_size
|
| 52 |
+
self.use_rms_norm = use_rms_norm
|
| 53 |
+
self.activation = activation
|
| 54 |
+
self.is_lora_enabled = is_lora_enabled
|
| 55 |
+
|
| 56 |
+
self.conv1d = ColumnParallelLinear(
|
| 57 |
+
input_size=conv_kernel_size,
|
| 58 |
+
output_size=intermediate_size,
|
| 59 |
+
bias=use_conv_bias,
|
| 60 |
+
)
|
| 61 |
+
# unsqueeze to fit conv1d weights shape into the linear weights shape.
|
| 62 |
+
# Can't do this in `weight_loader` since it already exists in
|
| 63 |
+
# `ColumnParallelLinear` and `set_weight_attrs`
|
| 64 |
+
# doesn't allow to override it
|
| 65 |
+
self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
|
| 66 |
+
|
| 67 |
+
self.in_proj = MergedColumnParallelLinear(hidden_size,
|
| 68 |
+
[intermediate_size] * 2,
|
| 69 |
+
bias=use_bias)
|
| 70 |
+
|
| 71 |
+
# selective projection used to make dt, B and C input dependent
|
| 72 |
+
self.x_proj = RowParallelLinear(
|
| 73 |
+
intermediate_size,
|
| 74 |
+
time_step_rank + ssm_state_size * 2,
|
| 75 |
+
bias=False,
|
| 76 |
+
)
|
| 77 |
+
# time step projection (discretization) -
|
| 78 |
+
# In the forward we need to apply dt_proj without the bias,
|
| 79 |
+
# as the bias is added in the selective scan kernel.
|
| 80 |
+
self.dt_proj = ColumnParallelLinear(time_step_rank,
|
| 81 |
+
intermediate_size,
|
| 82 |
+
bias=True,
|
| 83 |
+
skip_bias_add=True)
|
| 84 |
+
|
| 85 |
+
def weight_loader(param: Parameter, loaded_weight: torch.Tensor):
|
| 86 |
+
tp_rank = get_tensor_model_parallel_rank()
|
| 87 |
+
tp_size = get_tensor_model_parallel_world_size()
|
| 88 |
+
param.data.copy_(
|
| 89 |
+
loaded_weight.data.split(loaded_weight.shape[0] // tp_size,
|
| 90 |
+
dim=0)[tp_rank])
|
| 91 |
+
|
| 92 |
+
def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor):
|
| 93 |
+
weight_loader(param, -torch.exp(loaded_weight.float()))
|
| 94 |
+
|
| 95 |
+
tp_size = get_tensor_model_parallel_world_size()
|
| 96 |
+
self.A = nn.Parameter(
|
| 97 |
+
torch.empty(
|
| 98 |
+
intermediate_size // tp_size,
|
| 99 |
+
ssm_state_size,
|
| 100 |
+
dtype=torch.float32,
|
| 101 |
+
))
|
| 102 |
+
self.D = nn.Parameter(torch.ones(intermediate_size // tp_size))
|
| 103 |
+
|
| 104 |
+
set_weight_attrs(self.D, {"weight_loader": weight_loader})
|
| 105 |
+
set_weight_attrs(self.A, {"weight_loader": A_weight_loader})
|
| 106 |
+
|
| 107 |
+
self.out_proj = RowParallelLinear(
|
| 108 |
+
intermediate_size,
|
| 109 |
+
hidden_size,
|
| 110 |
+
bias=use_bias,
|
| 111 |
+
input_is_parallel=True,
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
self.dt_layernorm = RMSNorm(
|
| 115 |
+
time_step_rank,
|
| 116 |
+
eps=rms_norm_eps,
|
| 117 |
+
has_weight=rms_norm_has_weight,
|
| 118 |
+
) if use_rms_norm else None
|
| 119 |
+
|
| 120 |
+
self.b_layernorm = RMSNorm(
|
| 121 |
+
ssm_state_size,
|
| 122 |
+
eps=rms_norm_eps,
|
| 123 |
+
has_weight=rms_norm_has_weight,
|
| 124 |
+
) if use_rms_norm else None
|
| 125 |
+
|
| 126 |
+
self.c_layernorm = RMSNorm(
|
| 127 |
+
ssm_state_size,
|
| 128 |
+
eps=rms_norm_eps,
|
| 129 |
+
has_weight=rms_norm_has_weight,
|
| 130 |
+
) if use_rms_norm else None
|
| 131 |
+
|
| 132 |
+
def forward_native(self, hidden_states: torch.Tensor,
|
| 133 |
+
attn_metadata: AttentionMetadata,
|
| 134 |
+
conv_state: torch.Tensor, ssm_state: torch.Tensor):
|
| 135 |
+
pass
|
| 136 |
+
|
| 137 |
+
def forward_cuda(self, hidden_states: torch.Tensor,
|
| 138 |
+
attn_metadata: AttentionMetadata,
|
| 139 |
+
mamba_cache_params: MambaCacheParams):
|
| 140 |
+
|
| 141 |
+
# 1. Gated MLP's linear projection
|
| 142 |
+
projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)
|
| 143 |
+
hidden_states, gate = projected_states.chunk(2, dim=-2)
|
| 144 |
+
|
| 145 |
+
# 2. Convolution sequence transformation
|
| 146 |
+
conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0),
|
| 147 |
+
self.conv1d.weight.size(2))
|
| 148 |
+
|
| 149 |
+
if attn_metadata.query_start_loc is not None \
|
| 150 |
+
and attn_metadata.context_lens_tensor is not None:
|
| 151 |
+
# |---------- N-1 iteration --------|
|
| 152 |
+
# |---------------- N iteration ---------------------|
|
| 153 |
+
# |- tokenA -|......................|-- newTokens ---|
|
| 154 |
+
# |---------- context_len ----------|
|
| 155 |
+
# |-------------------- seq_len ---------------------|
|
| 156 |
+
# |-- query_len ---|
|
| 157 |
+
hidden_states = causal_conv1d_fn(
|
| 158 |
+
hidden_states,
|
| 159 |
+
conv_weights,
|
| 160 |
+
self.conv1d.bias,
|
| 161 |
+
activation=self.activation,
|
| 162 |
+
conv_states=mamba_cache_params.conv_state,
|
| 163 |
+
has_initial_state=attn_metadata.context_lens_tensor > 0,
|
| 164 |
+
cache_indices=mamba_cache_params.state_indices_tensor,
|
| 165 |
+
query_start_loc=attn_metadata.query_start_loc)
|
| 166 |
+
else:
|
| 167 |
+
hidden_states = causal_conv1d_update(
|
| 168 |
+
hidden_states.transpose(0, 1),
|
| 169 |
+
mamba_cache_params.conv_state,
|
| 170 |
+
conv_weights,
|
| 171 |
+
self.conv1d.bias,
|
| 172 |
+
self.activation,
|
| 173 |
+
conv_state_indices=mamba_cache_params.state_indices_tensor)
|
| 174 |
+
hidden_states = hidden_states.transpose(0, 1)
|
| 175 |
+
|
| 176 |
+
# 3. State Space Model sequence transformation
|
| 177 |
+
# 3.a. input varying initialization of time_step, B and C
|
| 178 |
+
|
| 179 |
+
if self.is_lora_enabled:
|
| 180 |
+
# lora kernel requires contiguous tensor
|
| 181 |
+
ssm_parameters = self.x_proj(
|
| 182 |
+
hidden_states.transpose(-2, -1).contiguous())[0]
|
| 183 |
+
else:
|
| 184 |
+
ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0]
|
| 185 |
+
|
| 186 |
+
time_step, B, C = torch.split(
|
| 187 |
+
ssm_parameters,
|
| 188 |
+
[self.time_step_rank, self.ssm_state_size, self.ssm_state_size],
|
| 189 |
+
dim=-1,
|
| 190 |
+
)
|
| 191 |
+
if self.use_rms_norm:
|
| 192 |
+
assert self.dt_layernorm is not None
|
| 193 |
+
assert self.b_layernorm is not None
|
| 194 |
+
assert self.c_layernorm is not None
|
| 195 |
+
time_step = self.dt_layernorm(time_step.contiguous())
|
| 196 |
+
B = self.b_layernorm(B.contiguous())
|
| 197 |
+
C = self.c_layernorm(C.contiguous())
|
| 198 |
+
|
| 199 |
+
discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
|
| 200 |
+
# 3.c perform the recurrence y ← SSM(A, B, C)(x)
|
| 201 |
+
time_proj_bias = (self.dt_proj.bias.float() if hasattr(
|
| 202 |
+
self.dt_proj, "bias") else None)
|
| 203 |
+
|
| 204 |
+
if attn_metadata.query_start_loc is not None \
|
| 205 |
+
and attn_metadata.context_lens_tensor is not None:
|
| 206 |
+
scan_outputs = selective_scan_fn(
|
| 207 |
+
hidden_states,
|
| 208 |
+
mamba_cache_params.ssm_state,
|
| 209 |
+
discrete_time_step,
|
| 210 |
+
self.A,
|
| 211 |
+
B.transpose(-2, -1),
|
| 212 |
+
C.transpose(-2, -1),
|
| 213 |
+
self.D.float(),
|
| 214 |
+
gate,
|
| 215 |
+
time_proj_bias,
|
| 216 |
+
delta_softplus=True,
|
| 217 |
+
cache_indices=mamba_cache_params.state_indices_tensor,
|
| 218 |
+
has_initial_state=attn_metadata.context_lens_tensor > 0,
|
| 219 |
+
query_start_loc=attn_metadata.query_start_loc)
|
| 220 |
+
else:
|
| 221 |
+
scan_outputs = selective_state_update(
|
| 222 |
+
mamba_cache_params.ssm_state,
|
| 223 |
+
hidden_states.transpose(0, 1),
|
| 224 |
+
discrete_time_step.transpose(0, 1),
|
| 225 |
+
self.A,
|
| 226 |
+
B,
|
| 227 |
+
C,
|
| 228 |
+
self.D,
|
| 229 |
+
gate.transpose(0, 1),
|
| 230 |
+
time_proj_bias,
|
| 231 |
+
dt_softplus=True,
|
| 232 |
+
state_batch_indices=mamba_cache_params.state_indices_tensor)
|
| 233 |
+
scan_outputs = scan_outputs.transpose(0, 1)
|
| 234 |
+
|
| 235 |
+
# 4. Final linear projection
|
| 236 |
+
if self.is_lora_enabled:
|
| 237 |
+
# lora kernel requires contiguous tensor
|
| 238 |
+
contextualized_states = self.out_proj(
|
| 239 |
+
scan_outputs.transpose(-2, -1).contiguous())[0]
|
| 240 |
+
else:
|
| 241 |
+
contextualized_states = self.out_proj(
|
| 242 |
+
scan_outputs.transpose(-2, -1))[0]
|
| 243 |
+
return contextualized_states
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (209 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/causal_conv1d.cpython-311.pyc
ADDED
|
Binary file (4.83 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/__pycache__/mamba_ssm.cpython-311.pyc
ADDED
|
Binary file (19.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copyright (c) 2024, Tri Dao.
|
| 4 |
+
# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
|
| 5 |
+
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from vllm import _custom_ops as ops
|
| 11 |
+
from vllm.attention.backends.utils import PAD_SLOT_ID
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def causal_conv1d_fn(x: torch.Tensor,
|
| 15 |
+
weight: torch.Tensor,
|
| 16 |
+
bias: Optional[torch.Tensor] = None,
|
| 17 |
+
query_start_loc: Optional[torch.Tensor] = None,
|
| 18 |
+
cache_indices: Optional[torch.Tensor] = None,
|
| 19 |
+
has_initial_state: Optional[torch.Tensor] = None,
|
| 20 |
+
conv_states: Optional[torch.Tensor] = None,
|
| 21 |
+
activation: Optional[str] = "silu",
|
| 22 |
+
pad_slot_id: int = PAD_SLOT_ID):
|
| 23 |
+
"""
|
| 24 |
+
x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
|
| 25 |
+
sequences are concatenated from left to right for varlen
|
| 26 |
+
weight: (dim, width)
|
| 27 |
+
bias: (dim,)
|
| 28 |
+
query_start_loc: (batch + 1) int32
|
| 29 |
+
The cumulative sequence lengths of the sequences in
|
| 30 |
+
the batch, used to index into sequence. prepended by 0.
|
| 31 |
+
for example: query_start_loc = torch.Tensor([0,10,16,17]),
|
| 32 |
+
x.shape=(dim,17)
|
| 33 |
+
cache_indices: (batch) int32
|
| 34 |
+
indicates the corresponding state index,
|
| 35 |
+
like so: conv_state = conv_states[cache_indices[batch_id]]
|
| 36 |
+
has_initial_state: (batch) bool
|
| 37 |
+
indicates whether should the kernel take the current state as initial
|
| 38 |
+
state for the calculations
|
| 39 |
+
conv_states: (...,dim,width - 1) itype
|
| 40 |
+
updated inplace if provided
|
| 41 |
+
activation: either None or "silu" or "swish"
|
| 42 |
+
pad_slot_id: int
|
| 43 |
+
if cache_indices is passed, lets the kernel identify padded
|
| 44 |
+
entries that will not be processed,
|
| 45 |
+
for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
|
| 46 |
+
in this case, the kernel will not process entries at
|
| 47 |
+
indices 0 and 3
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
out: (batch, dim, seqlen)
|
| 51 |
+
"""
|
| 52 |
+
if activation not in [None, "silu", "swish"]:
|
| 53 |
+
raise NotImplementedError("activation must be None, silu, or swish")
|
| 54 |
+
if x.stride(-1) != 1:
|
| 55 |
+
x = x.contiguous()
|
| 56 |
+
bias = bias.contiguous() if bias is not None else None
|
| 57 |
+
|
| 58 |
+
ops.causal_conv1d_fwd(x, weight, bias, conv_states, query_start_loc,
|
| 59 |
+
cache_indices, has_initial_state, activation
|
| 60 |
+
in ["silu", "swish"], pad_slot_id)
|
| 61 |
+
return x
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def causal_conv1d_update(x: torch.Tensor,
|
| 65 |
+
conv_state: torch.Tensor,
|
| 66 |
+
weight: torch.Tensor,
|
| 67 |
+
bias: Optional[torch.Tensor] = None,
|
| 68 |
+
activation: Optional[str] = None,
|
| 69 |
+
cache_seqlens: Optional[torch.Tensor] = None,
|
| 70 |
+
conv_state_indices: Optional[torch.Tensor] = None,
|
| 71 |
+
pad_slot_id: int = PAD_SLOT_ID):
|
| 72 |
+
"""
|
| 73 |
+
x: (batch, dim) or (batch, dim, seqlen)
|
| 74 |
+
conv_state: (batch, dim, state_len), where state_len >= width - 1
|
| 75 |
+
weight: (dim, width)
|
| 76 |
+
bias: (dim,)
|
| 77 |
+
cache_seqlens: (batch,), dtype int32.
|
| 78 |
+
If not None, the conv_state is treated as a circular buffer.
|
| 79 |
+
The conv_state will be updated by copying x to the conv_state
|
| 80 |
+
starting at the index
|
| 81 |
+
@cache_seqlens % state_len.
|
| 82 |
+
conv_state_indices: (batch,), dtype int32
|
| 83 |
+
If not None, the conv_state is a larger tensor along the batch dim,
|
| 84 |
+
and we are selecting the batch coords specified by conv_state_indices.
|
| 85 |
+
Useful for a continuous batching scenario.
|
| 86 |
+
pad_slot_id: int
|
| 87 |
+
if cache_indices is passed, lets the kernel identify padded
|
| 88 |
+
entries that will not be processed,
|
| 89 |
+
for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
|
| 90 |
+
in this case, the kernel will not process entries at
|
| 91 |
+
indices 0 and 3
|
| 92 |
+
out: (batch, dim) or (batch, dim, seqlen)
|
| 93 |
+
"""
|
| 94 |
+
if activation not in [None, "silu", "swish"]:
|
| 95 |
+
raise NotImplementedError("activation must be None, silu, or swish")
|
| 96 |
+
activation_val = activation in ["silu", "swish"]
|
| 97 |
+
unsqueeze = x.dim() == 2
|
| 98 |
+
if unsqueeze:
|
| 99 |
+
x = x.unsqueeze(-1)
|
| 100 |
+
ops.causal_conv1d_update(x, conv_state, weight, bias, activation_val,
|
| 101 |
+
cache_seqlens, conv_state_indices, pad_slot_id)
|
| 102 |
+
if unsqueeze:
|
| 103 |
+
x = x.squeeze(-1)
|
| 104 |
+
return x
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
# Copyright (c) 2024, Tri Dao, Albert Gu.
|
| 4 |
+
# Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import triton
|
| 8 |
+
import triton.language as tl
|
| 9 |
+
from packaging import version
|
| 10 |
+
|
| 11 |
+
from vllm import _custom_ops as ops
|
| 12 |
+
from vllm.attention.backends.utils import PAD_SLOT_ID
|
| 13 |
+
|
| 14 |
+
TRITON3 = version.parse(triton.__version__) >= version.parse("3.0.0")
|
| 15 |
+
|
| 16 |
+
if TRITON3:
|
| 17 |
+
|
| 18 |
+
@triton.jit
|
| 19 |
+
def softplus(dt):
|
| 20 |
+
dt = tl.where(dt <= 20.0, tl.math.log(tl.math.exp(dt) + 1), dt)
|
| 21 |
+
return dt
|
| 22 |
+
else:
|
| 23 |
+
|
| 24 |
+
@triton.jit
|
| 25 |
+
def softplus(dt):
|
| 26 |
+
dt = tl.where(dt <= 20.0, tl.math.log1p(tl.exp(dt)), dt)
|
| 27 |
+
return dt
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@triton.heuristics(
|
| 31 |
+
{"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
|
| 32 |
+
@triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
|
| 33 |
+
@triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
|
| 34 |
+
@triton.heuristics({
|
| 35 |
+
"HAS_STATE_BATCH_INDICES":
|
| 36 |
+
lambda args: args["state_batch_indices_ptr"] is not None
|
| 37 |
+
})
|
| 38 |
+
@triton.heuristics(
|
| 39 |
+
{"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])})
|
| 40 |
+
@triton.jit
|
| 41 |
+
def _selective_scan_update_kernel(
|
| 42 |
+
# Pointers to matrices
|
| 43 |
+
state_ptr,
|
| 44 |
+
x_ptr,
|
| 45 |
+
dt_ptr,
|
| 46 |
+
dt_bias_ptr,
|
| 47 |
+
A_ptr,
|
| 48 |
+
B_ptr,
|
| 49 |
+
C_ptr,
|
| 50 |
+
D_ptr,
|
| 51 |
+
z_ptr,
|
| 52 |
+
out_ptr,
|
| 53 |
+
state_batch_indices_ptr,
|
| 54 |
+
pad_slot_id,
|
| 55 |
+
# Matrix dimensions
|
| 56 |
+
batch,
|
| 57 |
+
nheads,
|
| 58 |
+
dim,
|
| 59 |
+
dstate,
|
| 60 |
+
nheads_ngroups_ratio,
|
| 61 |
+
# Strides
|
| 62 |
+
stride_state_batch,
|
| 63 |
+
stride_state_head,
|
| 64 |
+
stride_state_dim,
|
| 65 |
+
stride_state_dstate,
|
| 66 |
+
stride_x_batch,
|
| 67 |
+
stride_x_head,
|
| 68 |
+
stride_x_dim,
|
| 69 |
+
stride_dt_batch,
|
| 70 |
+
stride_dt_head,
|
| 71 |
+
stride_dt_dim,
|
| 72 |
+
stride_dt_bias_head,
|
| 73 |
+
stride_dt_bias_dim,
|
| 74 |
+
stride_A_head,
|
| 75 |
+
stride_A_dim,
|
| 76 |
+
stride_A_dstate,
|
| 77 |
+
stride_B_batch,
|
| 78 |
+
stride_B_group,
|
| 79 |
+
stride_B_dstate,
|
| 80 |
+
stride_C_batch,
|
| 81 |
+
stride_C_group,
|
| 82 |
+
stride_C_dstate,
|
| 83 |
+
stride_D_head,
|
| 84 |
+
stride_D_dim,
|
| 85 |
+
stride_z_batch,
|
| 86 |
+
stride_z_head,
|
| 87 |
+
stride_z_dim,
|
| 88 |
+
stride_out_batch,
|
| 89 |
+
stride_out_head,
|
| 90 |
+
stride_out_dim,
|
| 91 |
+
# Meta-parameters
|
| 92 |
+
DT_SOFTPLUS: tl.constexpr,
|
| 93 |
+
TIE_HDIM: tl.constexpr,
|
| 94 |
+
BLOCK_SIZE_M: tl.constexpr,
|
| 95 |
+
HAS_DT_BIAS: tl.constexpr,
|
| 96 |
+
HAS_D: tl.constexpr,
|
| 97 |
+
HAS_Z: tl.constexpr,
|
| 98 |
+
HAS_STATE_BATCH_INDICES: tl.constexpr,
|
| 99 |
+
BLOCK_SIZE_DSTATE: tl.constexpr,
|
| 100 |
+
):
|
| 101 |
+
pid_m = tl.program_id(axis=0)
|
| 102 |
+
pid_b = tl.program_id(axis=1)
|
| 103 |
+
pid_h = tl.program_id(axis=2)
|
| 104 |
+
|
| 105 |
+
# If HAS_STATE_BATCH_INDICES is true, then the ssm state's batch coordinate
|
| 106 |
+
# is taken from the state_batch_indices_ptr Otherwise, the state coordinate
|
| 107 |
+
# is the same as the batch id.
|
| 108 |
+
if HAS_STATE_BATCH_INDICES:
|
| 109 |
+
state_batch_indices_ptr += pid_b
|
| 110 |
+
state_batch_idx = tl.load(state_batch_indices_ptr)
|
| 111 |
+
state_ptr += (state_batch_idx * stride_state_batch +
|
| 112 |
+
pid_h * stride_state_head)
|
| 113 |
+
else:
|
| 114 |
+
state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
|
| 115 |
+
|
| 116 |
+
x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
|
| 117 |
+
dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
|
| 118 |
+
if HAS_DT_BIAS:
|
| 119 |
+
dt_bias_ptr += pid_h * stride_dt_bias_head
|
| 120 |
+
A_ptr += pid_h * stride_A_head
|
| 121 |
+
B_ptr += pid_b * stride_B_batch + (pid_h //
|
| 122 |
+
nheads_ngroups_ratio) * stride_B_group
|
| 123 |
+
C_ptr += pid_b * stride_C_batch + (pid_h //
|
| 124 |
+
nheads_ngroups_ratio) * stride_C_group
|
| 125 |
+
if HAS_Z:
|
| 126 |
+
z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
|
| 127 |
+
out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
|
| 128 |
+
|
| 129 |
+
offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
|
| 130 |
+
offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
|
| 131 |
+
state_ptrs = state_ptr + (offs_m[:, None] * stride_state_dim +
|
| 132 |
+
offs_n[None, :] * stride_state_dstate)
|
| 133 |
+
x_ptrs = x_ptr + offs_m * stride_x_dim
|
| 134 |
+
dt_ptrs = dt_ptr + offs_m * stride_dt_dim
|
| 135 |
+
if HAS_DT_BIAS:
|
| 136 |
+
dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
|
| 137 |
+
if HAS_D:
|
| 138 |
+
D_ptr += pid_h * stride_D_head
|
| 139 |
+
A_ptrs = A_ptr + (offs_m[:, None] * stride_A_dim +
|
| 140 |
+
offs_n[None, :] * stride_A_dstate)
|
| 141 |
+
B_ptrs = B_ptr + offs_n * stride_B_dstate
|
| 142 |
+
C_ptrs = C_ptr + offs_n * stride_C_dstate
|
| 143 |
+
if HAS_D:
|
| 144 |
+
D_ptrs = D_ptr + offs_m * stride_D_dim
|
| 145 |
+
if HAS_Z:
|
| 146 |
+
z_ptrs = z_ptr + offs_m * stride_z_dim
|
| 147 |
+
out_ptrs = out_ptr + offs_m * stride_out_dim
|
| 148 |
+
mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
|
| 149 |
+
if HAS_STATE_BATCH_INDICES:
|
| 150 |
+
mask &= (state_batch_idx != pad_slot_id)
|
| 151 |
+
state = tl.load(state_ptrs, mask=mask, other=0.0)
|
| 152 |
+
|
| 153 |
+
x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
|
| 154 |
+
if not TIE_HDIM:
|
| 155 |
+
dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
|
| 156 |
+
if HAS_DT_BIAS:
|
| 157 |
+
dt += tl.load(dt_bias_ptrs, mask=offs_m < dim,
|
| 158 |
+
other=0.0).to(tl.float32)
|
| 159 |
+
if DT_SOFTPLUS:
|
| 160 |
+
dt = softplus(dt)
|
| 161 |
+
A = tl.load(A_ptrs,
|
| 162 |
+
mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate),
|
| 163 |
+
other=0.0).to(tl.float32)
|
| 164 |
+
dA = tl.exp(A * dt[:, None])
|
| 165 |
+
else:
|
| 166 |
+
dt = tl.load(dt_ptr).to(tl.float32)
|
| 167 |
+
if HAS_DT_BIAS:
|
| 168 |
+
dt += tl.load(dt_bias_ptr).to(tl.float32)
|
| 169 |
+
if DT_SOFTPLUS:
|
| 170 |
+
dt = softplus(dt)
|
| 171 |
+
A = tl.load(A_ptr).to(tl.float32)
|
| 172 |
+
dA = tl.exp(A * dt) # scalar, not a matrix
|
| 173 |
+
|
| 174 |
+
B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
|
| 175 |
+
C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
|
| 176 |
+
if HAS_D:
|
| 177 |
+
D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
|
| 178 |
+
if HAS_Z:
|
| 179 |
+
z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
|
| 180 |
+
|
| 181 |
+
dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt
|
| 182 |
+
state = state * dA + dB * x[:, None]
|
| 183 |
+
|
| 184 |
+
mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
|
| 185 |
+
if HAS_STATE_BATCH_INDICES:
|
| 186 |
+
mask &= (state_batch_idx != pad_slot_id)
|
| 187 |
+
tl.store(state_ptrs, state, mask=mask)
|
| 188 |
+
out = tl.sum(state * C[None, :], axis=1)
|
| 189 |
+
if HAS_D:
|
| 190 |
+
out += x * D
|
| 191 |
+
if HAS_Z:
|
| 192 |
+
out *= z * tl.sigmoid(z)
|
| 193 |
+
tl.store(out_ptrs, out, mask=offs_m < dim)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def selective_state_update(state,
|
| 197 |
+
x,
|
| 198 |
+
dt,
|
| 199 |
+
A,
|
| 200 |
+
B,
|
| 201 |
+
C,
|
| 202 |
+
D=None,
|
| 203 |
+
z=None,
|
| 204 |
+
dt_bias=None,
|
| 205 |
+
dt_softplus=False,
|
| 206 |
+
state_batch_indices=None,
|
| 207 |
+
pad_slot_id=PAD_SLOT_ID):
|
| 208 |
+
"""
|
| 209 |
+
Argument:
|
| 210 |
+
state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
|
| 211 |
+
x: (batch, dim) or (batch, nheads, dim)
|
| 212 |
+
dt: (batch, dim) or (batch, nheads, dim)
|
| 213 |
+
A: (dim, dstate) or (nheads, dim, dstate)
|
| 214 |
+
B: (batch, dstate) or (batch, ngroups, dstate)
|
| 215 |
+
C: (batch, dstate) or (batch, ngroups, dstate)
|
| 216 |
+
D: (dim,) or (nheads, dim)
|
| 217 |
+
z: (batch, dim) or (batch, nheads, dim)
|
| 218 |
+
dt_bias: (dim,) or (nheads, dim)
|
| 219 |
+
pad_slot_id: int
|
| 220 |
+
if cache_indices is passed, lets the kernel identify padded
|
| 221 |
+
entries that will not be processed,
|
| 222 |
+
for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
|
| 223 |
+
in this case, the kernel will not process entries at
|
| 224 |
+
indices 0 and 3
|
| 225 |
+
Return:
|
| 226 |
+
out: (batch, dim) or (batch, nheads, dim)
|
| 227 |
+
"""
|
| 228 |
+
has_heads = state.dim() > 3
|
| 229 |
+
if state.dim() == 3:
|
| 230 |
+
state = state.unsqueeze(1)
|
| 231 |
+
if x.dim() == 2:
|
| 232 |
+
x = x.unsqueeze(1)
|
| 233 |
+
if dt.dim() == 2:
|
| 234 |
+
dt = dt.unsqueeze(1)
|
| 235 |
+
if A.dim() == 2:
|
| 236 |
+
A = A.unsqueeze(0)
|
| 237 |
+
if B.dim() == 2:
|
| 238 |
+
B = B.unsqueeze(1)
|
| 239 |
+
if C.dim() == 2:
|
| 240 |
+
C = C.unsqueeze(1)
|
| 241 |
+
if D is not None and D.dim() == 1:
|
| 242 |
+
D = D.unsqueeze(0)
|
| 243 |
+
if z is not None and z.dim() == 2:
|
| 244 |
+
z = z.unsqueeze(1)
|
| 245 |
+
if dt_bias is not None and dt_bias.dim() == 1:
|
| 246 |
+
dt_bias = dt_bias.unsqueeze(0)
|
| 247 |
+
|
| 248 |
+
_, nheads, dim, dstate = state.shape
|
| 249 |
+
batch = x.shape[0]
|
| 250 |
+
|
| 251 |
+
assert x.shape == (batch, nheads, dim)
|
| 252 |
+
assert dt.shape == x.shape
|
| 253 |
+
assert A.shape == (nheads, dim, dstate)
|
| 254 |
+
ngroups = B.shape[1]
|
| 255 |
+
assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
|
| 256 |
+
assert B.shape == (batch, ngroups, dstate)
|
| 257 |
+
assert C.shape == B.shape
|
| 258 |
+
if D is not None:
|
| 259 |
+
assert D.shape == (nheads, dim)
|
| 260 |
+
if z is not None:
|
| 261 |
+
assert z.shape == x.shape
|
| 262 |
+
if dt_bias is not None:
|
| 263 |
+
assert dt_bias.shape == (nheads, dim)
|
| 264 |
+
if state_batch_indices is not None:
|
| 265 |
+
assert state_batch_indices.shape == (batch, )
|
| 266 |
+
out = torch.empty_like(x)
|
| 267 |
+
grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE_M']), batch, nheads)
|
| 268 |
+
z_strides = ((z.stride(0), z.stride(1), z.stride(2)) if z is not None else
|
| 269 |
+
(0, 0, 0))
|
| 270 |
+
# We don't want autotune since it will overwrite the state
|
| 271 |
+
# We instead tune by hand.
|
| 272 |
+
BLOCK_SIZE_M, num_warps = ((32, 4) if dstate <= 16 else
|
| 273 |
+
((16, 4) if dstate <= 32 else
|
| 274 |
+
((8, 4) if dstate <= 64 else
|
| 275 |
+
((4, 4) if dstate <= 128 else ((4, 8))))))
|
| 276 |
+
tie_hdim = A.stride(-1) == 0 and A.stride(-2) == 0 and dt.stride(
|
| 277 |
+
-1) == 0 and dt_bias.stride(-1) == 0
|
| 278 |
+
with torch.cuda.device(x.device.index):
|
| 279 |
+
_selective_scan_update_kernel[grid](
|
| 280 |
+
state,
|
| 281 |
+
x,
|
| 282 |
+
dt,
|
| 283 |
+
dt_bias,
|
| 284 |
+
A,
|
| 285 |
+
B,
|
| 286 |
+
C,
|
| 287 |
+
D,
|
| 288 |
+
z,
|
| 289 |
+
out,
|
| 290 |
+
state_batch_indices,
|
| 291 |
+
pad_slot_id,
|
| 292 |
+
batch,
|
| 293 |
+
nheads,
|
| 294 |
+
dim,
|
| 295 |
+
dstate,
|
| 296 |
+
nheads // ngroups,
|
| 297 |
+
state.stride(0),
|
| 298 |
+
state.stride(1),
|
| 299 |
+
state.stride(2),
|
| 300 |
+
state.stride(3),
|
| 301 |
+
x.stride(0),
|
| 302 |
+
x.stride(1),
|
| 303 |
+
x.stride(2),
|
| 304 |
+
dt.stride(0),
|
| 305 |
+
dt.stride(1),
|
| 306 |
+
dt.stride(2),
|
| 307 |
+
*(dt_bias.stride(0),
|
| 308 |
+
dt_bias.stride(1)) if dt_bias is not None else 0,
|
| 309 |
+
A.stride(0),
|
| 310 |
+
A.stride(1),
|
| 311 |
+
A.stride(2),
|
| 312 |
+
B.stride(0),
|
| 313 |
+
B.stride(1),
|
| 314 |
+
B.stride(2),
|
| 315 |
+
C.stride(0),
|
| 316 |
+
C.stride(1),
|
| 317 |
+
C.stride(2),
|
| 318 |
+
*(D.stride(0), D.stride(1)) if D is not None else 0,
|
| 319 |
+
z_strides[0],
|
| 320 |
+
z_strides[1],
|
| 321 |
+
z_strides[2],
|
| 322 |
+
out.stride(0),
|
| 323 |
+
out.stride(1),
|
| 324 |
+
out.stride(2),
|
| 325 |
+
dt_softplus,
|
| 326 |
+
tie_hdim,
|
| 327 |
+
BLOCK_SIZE_M,
|
| 328 |
+
num_warps=num_warps,
|
| 329 |
+
)
|
| 330 |
+
if not has_heads:
|
| 331 |
+
out = out.squeeze(1)
|
| 332 |
+
return out
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def selective_scan_fn(u,
|
| 336 |
+
ssm_states,
|
| 337 |
+
delta,
|
| 338 |
+
A,
|
| 339 |
+
B,
|
| 340 |
+
C,
|
| 341 |
+
D=None,
|
| 342 |
+
z=None,
|
| 343 |
+
delta_bias=None,
|
| 344 |
+
delta_softplus=False,
|
| 345 |
+
query_start_loc=None,
|
| 346 |
+
cache_indices=None,
|
| 347 |
+
has_initial_state=None,
|
| 348 |
+
pad_slot_id=PAD_SLOT_ID) -> torch.Tensor:
|
| 349 |
+
"""
|
| 350 |
+
u: (dim, total_length) for varlen or (batch, dim, seqlen)
|
| 351 |
+
applies changes in place.
|
| 352 |
+
ssm_states: (batch, dim, dstate) or (batch, nheads, dim, dstate)
|
| 353 |
+
applies changes in place.
|
| 354 |
+
delta: (dim, total_length) for varlen or (batch, dim, seqlen)
|
| 355 |
+
A: (dim, dstate)
|
| 356 |
+
B: (ngroups, dstate, total_length) for varlen or
|
| 357 |
+
(batch,ngroups,dstate,seqlen)
|
| 358 |
+
C: (ngroups, dstate, total_length) for varlen or
|
| 359 |
+
(batch,ngroups,dstate,seqlen)
|
| 360 |
+
D: (dim,)
|
| 361 |
+
z: (dim, total_length) for varlen or (batch, dim, seqlen)
|
| 362 |
+
dt_bias: (dim,) or (dim)
|
| 363 |
+
query_start_loc: (batch + 1) int32
|
| 364 |
+
The cumulative sequence lengths of the sequences in
|
| 365 |
+
the batch, used to index into sequence. prepended with 0.
|
| 366 |
+
for example: query_start_loc = torch.Tensor([0,10,16,17]),
|
| 367 |
+
x.shape=(dim,17)
|
| 368 |
+
cache_indices: (batch) int32
|
| 369 |
+
A tensor with each cell is a correspondent
|
| 370 |
+
input and output ssm_state index
|
| 371 |
+
has_initial_state: (batch) bool
|
| 372 |
+
A tensor populated with ones and zeros,
|
| 373 |
+
indicate if the ssm_state at the corresponding index should be
|
| 374 |
+
used as initial state. Not providing argument assumes
|
| 375 |
+
there's no initial state
|
| 376 |
+
pad_slot_id: int
|
| 377 |
+
if cache_indices is passed, lets the kernel identify padding entries
|
| 378 |
+
that will not be processed,
|
| 379 |
+
for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
|
| 380 |
+
in this case, the kernel will not process entries at indices 0 and 3
|
| 381 |
+
returns
|
| 382 |
+
output: (dim, total_length) for varlen or (batch, dim, seqlen)
|
| 383 |
+
supports inplace replacement
|
| 384 |
+
"""
|
| 385 |
+
if u.stride(-1) != 1:
|
| 386 |
+
u = u.contiguous()
|
| 387 |
+
if delta.stride(-1) != 1:
|
| 388 |
+
delta = delta.contiguous()
|
| 389 |
+
if D is not None:
|
| 390 |
+
D = D.contiguous()
|
| 391 |
+
if B.stride(-1) != 1:
|
| 392 |
+
B = B.contiguous()
|
| 393 |
+
if C.stride(-1) != 1:
|
| 394 |
+
C = C.contiguous()
|
| 395 |
+
if z is not None and z.stride(-1) != 1:
|
| 396 |
+
z = z.contiguous()
|
| 397 |
+
if B.dim() == 3 and query_start_loc is None:
|
| 398 |
+
B = B.unsqueeze(1)
|
| 399 |
+
if B.dim() == 2 and query_start_loc is not None:
|
| 400 |
+
B = B.unsqueeze(0)
|
| 401 |
+
if C.dim() == 3 and query_start_loc is None:
|
| 402 |
+
C = C.unsqueeze(1)
|
| 403 |
+
if C.dim() == 2 and query_start_loc is not None:
|
| 404 |
+
C = C.unsqueeze(0)
|
| 405 |
+
|
| 406 |
+
ops.selective_scan_fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus,
|
| 407 |
+
query_start_loc, cache_indices, has_initial_state,
|
| 408 |
+
ssm_states, pad_slot_id)
|
| 409 |
+
|
| 410 |
+
if z is None:
|
| 411 |
+
return delta # output written inplace to delta
|
| 412 |
+
else:
|
| 413 |
+
return z # output written inplace to z
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (362 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/fp8_utils.cpython-311.pyc
ADDED
|
Binary file (23.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/machete_utils.cpython-311.pyc
ADDED
|
Binary file (1.85 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_fp8.cpython-311.pyc
ADDED
|
Binary file (5.04 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test.cpython-311.pyc
ADDED
|
Binary file (7.94 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_24.cpython-311.pyc
ADDED
|
Binary file (21.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/marlin_utils_test_qqq.cpython-311.pyc
ADDED
|
Binary file (6.86 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/quant_utils.cpython-311.pyc
ADDED
|
Binary file (27.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/__pycache__/w8a8_utils.cpython-311.pyc
ADDED
|
Binary file (8.69 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 5
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 32,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 16,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 32,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 5
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 16,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 5
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 5
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 32,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 16,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 64,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 64,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 64,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 32,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 5
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 32,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 32,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 32,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 64,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 16,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 16,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 32,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 16,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 5
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 16,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 32,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 5
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 32,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 64,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 32,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 5
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 64,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 32,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 3
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 16,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 16,
|
| 28 |
+
"BLOCK_SIZE_N": 64,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 16,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 32,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 64,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 32,
|
| 76 |
+
"BLOCK_SIZE_N": 256,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 64,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 128,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 256,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 8,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 64,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 32,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 32,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 256,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 256,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 8,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 128,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 64,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 32,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 32,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 16,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 64,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 32,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 32,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 32,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 256,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 256,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 64,
|
| 15 |
+
"num_warps": 8,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 128,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 128,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 64,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 16,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 64,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 5
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 5
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 5
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 5
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 32,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 4
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 32,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 16,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 4
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 32,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 4
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 64,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 5
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 32,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 64,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 1,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 4
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 64,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 64,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 16,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 16,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 64,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 32,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 64,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 16,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 32,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 8,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 32,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 8,
|
| 80 |
+
"num_stages": 5
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 8,
|
| 88 |
+
"num_stages": 5
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 8,
|
| 96 |
+
"num_stages": 5
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 128,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 64,
|
| 103 |
+
"num_warps": 8,
|
| 104 |
+
"num_stages": 2
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 16,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 4
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 128,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 32,
|
| 119 |
+
"num_warps": 8,
|
| 120 |
+
"num_stages": 2
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 128,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 8,
|
| 128 |
+
"num_stages": 2
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 128,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 8,
|
| 136 |
+
"num_stages": 2
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 32,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 32,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 64,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 64,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 16,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 64,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 64,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 32,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 16,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 16,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 5
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 32,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 8,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 64,
|
| 55 |
+
"num_warps": 8,
|
| 56 |
+
"num_stages": 5
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 16,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 32,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 8,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 32,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 8,
|
| 80 |
+
"num_stages": 5
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 16,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 128,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 8,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 128,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 8,
|
| 104 |
+
"num_stages": 2
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 16,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 128,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 32,
|
| 119 |
+
"num_warps": 8,
|
| 120 |
+
"num_stages": 2
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 128,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 8,
|
| 128 |
+
"num_stages": 2
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 128,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 8,
|
| 136 |
+
"num_stages": 2
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 32,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 5
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 5
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 32,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 64,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 64,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 16,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 256,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 256,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 8,
|
| 16 |
+
"num_stages": 3
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 16,
|
| 20 |
+
"BLOCK_SIZE_N": 256,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 8,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 16,
|
| 28 |
+
"BLOCK_SIZE_N": 256,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 8,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 128,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 32,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 2
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 32,
|
| 44 |
+
"BLOCK_SIZE_N": 256,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 8,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 256,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 8,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 256,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 8,
|
| 64 |
+
"num_stages": 5
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 256,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 8,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 128,
|
| 76 |
+
"BLOCK_SIZE_N": 128,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 8,
|
| 80 |
+
"num_stages": 2
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 128,
|
| 84 |
+
"BLOCK_SIZE_N": 128,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 64,
|
| 87 |
+
"num_warps": 8,
|
| 88 |
+
"num_stages": 2
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 2
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 2
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 2
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 2
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 2
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 1,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 2
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 256,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 256,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 64,
|
| 15 |
+
"num_warps": 8,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 256,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 8,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 128,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 128,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 128,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 32,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 128,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 128,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 64,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 128,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 128,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 128,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 1,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 256,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 256,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 8,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 256,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 8,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 128,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 16,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 128,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 32,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 128,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 32,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 128,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 64,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 128,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 128,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 128,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 128,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 32,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 1,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 128,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 128,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 3
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 64,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 64,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 64,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 64,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 64,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 128,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 8,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 16,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 64,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 32,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 5
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 3
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 3
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 16,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 3
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 3
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 32,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 64,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 128,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 32,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 64,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 64,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 64,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 64,
|
| 6 |
+
"GROUP_SIZE_M": 64,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 3
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 64,
|
| 14 |
+
"GROUP_SIZE_M": 16,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 4
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 32,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 64,
|
| 22 |
+
"GROUP_SIZE_M": 32,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 32,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 64,
|
| 30 |
+
"GROUP_SIZE_M": 16,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 64,
|
| 38 |
+
"GROUP_SIZE_M": 64,
|
| 39 |
+
"num_warps": 8,
|
| 40 |
+
"num_stages": 2
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 16,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 64,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 8,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 32,
|
| 55 |
+
"num_warps": 8,
|
| 56 |
+
"num_stages": 2
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 32,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 32,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 32,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 2
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 64,
|
| 79 |
+
"num_warps": 8,
|
| 80 |
+
"num_stages": 2
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 128,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 16,
|
| 95 |
+
"num_warps": 8,
|
| 96 |
+
"num_stages": 2
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 128,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 8,
|
| 104 |
+
"num_stages": 2
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 128,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 8,
|
| 112 |
+
"num_stages": 2
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 128,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 8,
|
| 120 |
+
"num_stages": 2
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 2
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 2
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 64,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 16,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 32,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 32,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 16,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 16,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 32,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 64,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 16,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 16,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 1,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 1,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 16,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 32,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 64,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 32,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 16,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 16,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 64,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 64,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 64,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 16,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 64,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 16,
|
| 4 |
+
"BLOCK_SIZE_N": 32,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 1,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 4
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 16,
|
| 12 |
+
"BLOCK_SIZE_N": 32,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 32,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 3
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 16,
|
| 20 |
+
"BLOCK_SIZE_N": 32,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 64,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 4
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 16,
|
| 28 |
+
"BLOCK_SIZE_N": 32,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 64,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 3
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 16,
|
| 36 |
+
"BLOCK_SIZE_N": 32,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 5
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 32,
|
| 44 |
+
"BLOCK_SIZE_N": 32,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 16,
|
| 47 |
+
"num_warps": 8,
|
| 48 |
+
"num_stages": 3
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 32,
|
| 52 |
+
"BLOCK_SIZE_N": 32,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 64,
|
| 55 |
+
"num_warps": 8,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 32,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 8,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 32,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 16,
|
| 71 |
+
"num_warps": 8,
|
| 72 |
+
"num_stages": 5
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 64,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 3
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 64,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 4
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 128,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 8,
|
| 96 |
+
"num_stages": 4
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 128,
|
| 100 |
+
"BLOCK_SIZE_N": 128,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 16,
|
| 103 |
+
"num_warps": 8,
|
| 104 |
+
"num_stages": 2
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 128,
|
| 108 |
+
"BLOCK_SIZE_N": 128,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 8,
|
| 112 |
+
"num_stages": 2
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 128,
|
| 116 |
+
"BLOCK_SIZE_N": 128,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 8,
|
| 120 |
+
"num_stages": 2
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 256,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 32,
|
| 127 |
+
"num_warps": 8,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 1,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 4
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 16,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|
.venv/lib/python3.11/site-packages/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"1": {
|
| 3 |
+
"BLOCK_SIZE_M": 64,
|
| 4 |
+
"BLOCK_SIZE_N": 64,
|
| 5 |
+
"BLOCK_SIZE_K": 128,
|
| 6 |
+
"GROUP_SIZE_M": 64,
|
| 7 |
+
"num_warps": 4,
|
| 8 |
+
"num_stages": 5
|
| 9 |
+
},
|
| 10 |
+
"2": {
|
| 11 |
+
"BLOCK_SIZE_M": 64,
|
| 12 |
+
"BLOCK_SIZE_N": 64,
|
| 13 |
+
"BLOCK_SIZE_K": 128,
|
| 14 |
+
"GROUP_SIZE_M": 1,
|
| 15 |
+
"num_warps": 4,
|
| 16 |
+
"num_stages": 5
|
| 17 |
+
},
|
| 18 |
+
"4": {
|
| 19 |
+
"BLOCK_SIZE_M": 64,
|
| 20 |
+
"BLOCK_SIZE_N": 64,
|
| 21 |
+
"BLOCK_SIZE_K": 128,
|
| 22 |
+
"GROUP_SIZE_M": 1,
|
| 23 |
+
"num_warps": 4,
|
| 24 |
+
"num_stages": 5
|
| 25 |
+
},
|
| 26 |
+
"8": {
|
| 27 |
+
"BLOCK_SIZE_M": 64,
|
| 28 |
+
"BLOCK_SIZE_N": 64,
|
| 29 |
+
"BLOCK_SIZE_K": 128,
|
| 30 |
+
"GROUP_SIZE_M": 16,
|
| 31 |
+
"num_warps": 4,
|
| 32 |
+
"num_stages": 4
|
| 33 |
+
},
|
| 34 |
+
"16": {
|
| 35 |
+
"BLOCK_SIZE_M": 64,
|
| 36 |
+
"BLOCK_SIZE_N": 64,
|
| 37 |
+
"BLOCK_SIZE_K": 128,
|
| 38 |
+
"GROUP_SIZE_M": 16,
|
| 39 |
+
"num_warps": 4,
|
| 40 |
+
"num_stages": 4
|
| 41 |
+
},
|
| 42 |
+
"24": {
|
| 43 |
+
"BLOCK_SIZE_M": 64,
|
| 44 |
+
"BLOCK_SIZE_N": 64,
|
| 45 |
+
"BLOCK_SIZE_K": 128,
|
| 46 |
+
"GROUP_SIZE_M": 1,
|
| 47 |
+
"num_warps": 4,
|
| 48 |
+
"num_stages": 4
|
| 49 |
+
},
|
| 50 |
+
"32": {
|
| 51 |
+
"BLOCK_SIZE_M": 64,
|
| 52 |
+
"BLOCK_SIZE_N": 64,
|
| 53 |
+
"BLOCK_SIZE_K": 128,
|
| 54 |
+
"GROUP_SIZE_M": 1,
|
| 55 |
+
"num_warps": 4,
|
| 56 |
+
"num_stages": 4
|
| 57 |
+
},
|
| 58 |
+
"48": {
|
| 59 |
+
"BLOCK_SIZE_M": 64,
|
| 60 |
+
"BLOCK_SIZE_N": 64,
|
| 61 |
+
"BLOCK_SIZE_K": 128,
|
| 62 |
+
"GROUP_SIZE_M": 1,
|
| 63 |
+
"num_warps": 4,
|
| 64 |
+
"num_stages": 4
|
| 65 |
+
},
|
| 66 |
+
"64": {
|
| 67 |
+
"BLOCK_SIZE_M": 64,
|
| 68 |
+
"BLOCK_SIZE_N": 64,
|
| 69 |
+
"BLOCK_SIZE_K": 128,
|
| 70 |
+
"GROUP_SIZE_M": 1,
|
| 71 |
+
"num_warps": 4,
|
| 72 |
+
"num_stages": 4
|
| 73 |
+
},
|
| 74 |
+
"96": {
|
| 75 |
+
"BLOCK_SIZE_M": 64,
|
| 76 |
+
"BLOCK_SIZE_N": 32,
|
| 77 |
+
"BLOCK_SIZE_K": 128,
|
| 78 |
+
"GROUP_SIZE_M": 1,
|
| 79 |
+
"num_warps": 4,
|
| 80 |
+
"num_stages": 4
|
| 81 |
+
},
|
| 82 |
+
"128": {
|
| 83 |
+
"BLOCK_SIZE_M": 64,
|
| 84 |
+
"BLOCK_SIZE_N": 32,
|
| 85 |
+
"BLOCK_SIZE_K": 128,
|
| 86 |
+
"GROUP_SIZE_M": 1,
|
| 87 |
+
"num_warps": 4,
|
| 88 |
+
"num_stages": 3
|
| 89 |
+
},
|
| 90 |
+
"256": {
|
| 91 |
+
"BLOCK_SIZE_M": 64,
|
| 92 |
+
"BLOCK_SIZE_N": 64,
|
| 93 |
+
"BLOCK_SIZE_K": 128,
|
| 94 |
+
"GROUP_SIZE_M": 1,
|
| 95 |
+
"num_warps": 4,
|
| 96 |
+
"num_stages": 3
|
| 97 |
+
},
|
| 98 |
+
"512": {
|
| 99 |
+
"BLOCK_SIZE_M": 64,
|
| 100 |
+
"BLOCK_SIZE_N": 64,
|
| 101 |
+
"BLOCK_SIZE_K": 128,
|
| 102 |
+
"GROUP_SIZE_M": 1,
|
| 103 |
+
"num_warps": 4,
|
| 104 |
+
"num_stages": 3
|
| 105 |
+
},
|
| 106 |
+
"1024": {
|
| 107 |
+
"BLOCK_SIZE_M": 64,
|
| 108 |
+
"BLOCK_SIZE_N": 64,
|
| 109 |
+
"BLOCK_SIZE_K": 128,
|
| 110 |
+
"GROUP_SIZE_M": 1,
|
| 111 |
+
"num_warps": 4,
|
| 112 |
+
"num_stages": 3
|
| 113 |
+
},
|
| 114 |
+
"1536": {
|
| 115 |
+
"BLOCK_SIZE_M": 64,
|
| 116 |
+
"BLOCK_SIZE_N": 64,
|
| 117 |
+
"BLOCK_SIZE_K": 128,
|
| 118 |
+
"GROUP_SIZE_M": 1,
|
| 119 |
+
"num_warps": 4,
|
| 120 |
+
"num_stages": 3
|
| 121 |
+
},
|
| 122 |
+
"2048": {
|
| 123 |
+
"BLOCK_SIZE_M": 64,
|
| 124 |
+
"BLOCK_SIZE_N": 128,
|
| 125 |
+
"BLOCK_SIZE_K": 128,
|
| 126 |
+
"GROUP_SIZE_M": 1,
|
| 127 |
+
"num_warps": 4,
|
| 128 |
+
"num_stages": 3
|
| 129 |
+
},
|
| 130 |
+
"3072": {
|
| 131 |
+
"BLOCK_SIZE_M": 64,
|
| 132 |
+
"BLOCK_SIZE_N": 128,
|
| 133 |
+
"BLOCK_SIZE_K": 128,
|
| 134 |
+
"GROUP_SIZE_M": 32,
|
| 135 |
+
"num_warps": 4,
|
| 136 |
+
"num_stages": 3
|
| 137 |
+
},
|
| 138 |
+
"4096": {
|
| 139 |
+
"BLOCK_SIZE_M": 64,
|
| 140 |
+
"BLOCK_SIZE_N": 128,
|
| 141 |
+
"BLOCK_SIZE_K": 128,
|
| 142 |
+
"GROUP_SIZE_M": 32,
|
| 143 |
+
"num_warps": 4,
|
| 144 |
+
"num_stages": 3
|
| 145 |
+
}
|
| 146 |
+
}
|