Buckets:

leideng
/

QCFuse

Files

xet

leideng/QCFuse / srt /layers /quantization /__init__.py

leideng

13 days ago

download

raw

6.34 kB

	# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
	from __future__ import annotations

	import builtins
	import inspect
	from typing import TYPE_CHECKING, Dict, Optional, Type

	import torch

	try:
	from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
	from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
	from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
	from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
	from vllm.model_executor.layers.quantization.gguf import GGUFConfig
	from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
	GPTQMarlin24Config,
	)
	from vllm.model_executor.layers.quantization.marlin import MarlinConfig
	from vllm.model_executor.layers.quantization.qqq import QQQConfig
	from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

	VLLM_AVAILABLE = True
	except ImportError as e:
	VLLM_AVAILABLE = False
	VLLM_IMPORT_ERROR = e

	# Define empty classes as placeholders when vllm is not available
	class DummyConfig:
	    """Placeholder bound to the vLLM config names when vLLM cannot be imported.

	    It lets the rest of this module reference those names without raising
	    ``NameError``; it carries no quantization logic of its own.
	    """

	    def override_quantization_method(self, *args, **kwargs):
	        """Accept any call signature and report that no override applies.

	        Args:
	            *args: Ignored positional arguments.
	            **kwargs: Ignored keyword arguments.

	        Returns:
	            Always ``None``.
	        """
	        # Variadic on purpose: the placeholder must tolerate whatever
	        # arguments a caller would pass to a real config class.
	        return None

	# Bind every vLLM-provided config name to the same placeholder class so the
	# module-level references to these names further down still resolve when the
	# vLLM import above failed.
	# NOTE(review): CompressedTensorsConfig is also imported from sglang later in
	# this file, which rebinds that name regardless of this fallback.
	AQLMConfig = BitsAndBytesConfig = CompressedTensorsConfig = DeepSpeedFPConfig = (
	ExpertsInt8Config
	) = GGUFConfig = GPTQMarlin24Config = MarlinConfig = QQQConfig = Int8TpuConfig = (
	DummyConfig
	)


	from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
	from sglang.srt.layers.quantization.base_config import QuantizationConfig
	from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
	from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
	CompressedTensorsConfig,
	)
	from sglang.srt.layers.quantization.fp8 import Fp8Config
	from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
	from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
	from sglang.srt.layers.quantization.modelopt_quant import (
	ModelOptFp4Config,
	ModelOptFp8Config,
	)
	from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
	from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
	from sglang.srt.layers.quantization.petit import PetitNvFp4Config
	from sglang.srt.layers.quantization.qoq import QoQConfig
	from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
	from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
	from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
	from sglang.srt.utils import is_cuda, is_hip, mxfp_supported

	# Result of mxfp_supported(), evaluated once at import time; it gates the
	# is_hip() registration branch further down in this module.
	_is_mxfp_supported = mxfp_supported()

	if TYPE_CHECKING:
	from sglang.srt.layers.moe.topk import TopKOutput

	# Base quantization methods that don't depend on vllm: every config class
	# listed here is imported from sglang.srt.layers.quantization above.
	# Keys are the method names accepted by get_quantization_config(); values are
	# the config classes it returns. Insertion order is visible to users because
	# the "Available methods" error message lists the keys in this order.
	BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
	"fp8": Fp8Config,
	"blockwise_int8": BlockInt8Config,
	"modelopt": ModelOptFp8Config, # Auto-detect, defaults to FP8
	"modelopt_fp8": ModelOptFp8Config,
	"modelopt_fp4": ModelOptFp4Config,
	"w8a8_int8": W8A8Int8Config,
	"w8a8_fp8": W8A8Fp8Config,
	"awq": AWQConfig,
	"awq_marlin": AWQMarlinConfig,
	"gptq": GPTQConfig,
	"gptq_marlin": GPTQMarlinConfig,
	"moe_wna16": MoeWNA16Config,
	"compressed-tensors": CompressedTensorsConfig,
	"qoq": QoQConfig,
	"w4afp8": W4AFp8Config,
	"petit_nvfp4": PetitNvFp4Config,
	"fbgemm_fp8": FBGEMMFp8Config,
	}


	# Platform-specific registrations for the "quark" and "mxfp4" method names.
	# On CUDA both names resolve to Mxfp4Config. On HIP, and only when
	# _is_mxfp_supported is true, "quark" resolves to QuarkConfig instead.
	# On any other platform neither name is registered.
	if is_cuda():
	    BASE_QUANTIZATION_METHODS["quark"] = Mxfp4Config
	    BASE_QUANTIZATION_METHODS["mxfp4"] = Mxfp4Config
	elif _is_mxfp_supported and is_hip():
	    # Imported lazily so the quark module is only loaded on this branch.
	    from sglang.srt.layers.quantization.quark.quark import QuarkConfig

	    BASE_QUANTIZATION_METHODS["quark"] = QuarkConfig
	    BASE_QUANTIZATION_METHODS["mxfp4"] = Mxfp4Config
	# VLLM-dependent quantization methods.
	# Each value is the class imported from vllm at the top of this module, or the
	# DummyConfig placeholder when that import failed. get_quantization_config()
	# refuses these names while VLLM_AVAILABLE is False, so the placeholder is
	# never handed out through that function.
	VLLM_QUANTIZATION_METHODS = {
	"aqlm": AQLMConfig,
	"deepspeedfp": DeepSpeedFPConfig,
	"tpu_int8": Int8TpuConfig,
	"marlin": MarlinConfig,
	"gguf": GGUFConfig,
	"gptq_marlin_24": GPTQMarlin24Config,
	"bitsandbytes": BitsAndBytesConfig,
	"qqq": QQQConfig,
	"experts_int8": ExpertsInt8Config,
	}

	# Single name -> config-class lookup table covering both registries.
	# The dicts have to be unpacked with ``**``: writing ``{a, b}`` builds a set
	# literal, which raises TypeError because dicts are unhashable, and a set
	# would not support the ``.keys()`` / ``[name]`` access used by
	# get_quantization_config() anyway.
	QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
	    **BASE_QUANTIZATION_METHODS,
	    **VLLM_QUANTIZATION_METHODS,
	}


	def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
	    """Return the config class registered under a quantization method name.

	    Args:
	        quantization: Key to look up in ``QUANTIZATION_METHODS``.

	    Returns:
	        The class stored under ``quantization``.

	    Raises:
	        ValueError: If ``quantization`` is not a registered name, or if it is
	            listed in ``VLLM_QUANTIZATION_METHODS`` while ``VLLM_AVAILABLE``
	            is false.
	    """
	    # Unknown name: report it together with every registered name.
	    if quantization not in QUANTIZATION_METHODS:
	        message = (
	            f"Invalid quantization method: {quantization}. "
	            f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
	        )
	        raise ValueError(message)

	    # Known name, but backed by vLLM, which failed to import.
	    needs_vllm = quantization in VLLM_QUANTIZATION_METHODS
	    if needs_vllm and not VLLM_AVAILABLE:
	        message = (
	            f"{quantization} quantization requires some operators from vllm. "
	            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
	            f"Import error: {VLLM_IMPORT_ERROR}"
	        )
	        raise ValueError(message)

	    return QUANTIZATION_METHODS[quantization]


	# Reference to the real builtin, captured at import time. The patched
	# isinstance delegates to it, and the reverse path restores it.
	original_isinstance = builtins.isinstance


	def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
	    """Swap ``builtins.isinstance`` so vLLM layer checks accept sglang layers.

	    While the patch is active, an ``isinstance`` check against vLLM's
	    ``LinearBase``, ``FusedMoE`` or ``VocabParallelEmbedding`` is answered
	    against the corresponding sglang class instead, so that the
	    ``get_quant_method`` in vllm's QuantizationConfig can recognize sglang
	    layers. Every other check is forwarded unchanged.

	    Args:
	        reverse: When true, restore the original ``isinstance`` and return.

	    Does nothing when ``VLLM_AVAILABLE`` is false.
	    """
	    if not VLLM_AVAILABLE:
	        return

	    if reverse:
	        builtins.isinstance = original_isinstance
	        return

	    from vllm.model_executor.layers.fused_moe import FusedMoE as VllmFusedMoE
	    from vllm.model_executor.layers.linear import LinearBase as VllmLinearBase
	    from vllm.model_executor.layers.vocab_parallel_embedding import (
	        VocabParallelEmbedding as VllmVocabParallelEmbedding,
	    )

	    from sglang.srt.layers.linear import LinearBase as SglLinearBase
	    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as SglFusedMoE
	    from sglang.srt.layers.vocab_parallel_embedding import (
	        VocabParallelEmbedding as SglVocabParallelEmbedding,
	    )

	    # (vLLM class, sglang stand-in) pairs, checked in this order.
	    substitutions = (
	        (VllmLinearBase, SglLinearBase),
	        (VllmFusedMoE, SglFusedMoE),
	        (VllmVocabParallelEmbedding, SglVocabParallelEmbedding),
	    )

	    def patched_isinstance(obj, classinfo):
	        # Match by identity only, so tuples of classes and unrelated classes
	        # fall through to the real isinstance untouched.
	        for vllm_cls, sglang_cls in substitutions:
	            if classinfo is vllm_cls:
	                return original_isinstance(obj, sglang_cls)
	        return original_isinstance(obj, classinfo)

	    builtins.isinstance = patched_isinstance

Xet Storage Details

Size: 6.34 kB
Xet hash: 7777837d98cf0a42b182e455ca6e1172f069b2e4c5267aca38449ba54fe1e015

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.