| """Tests for bitsandbytes MPS 4-bit quantization kernels.""" |
|
|
| import pytest |
| import torch |
|
|
| from bitsandbytes_mps import ( |
| FP4, |
| NF4, |
| dequantize_4bit, |
| gemm_4bit, |
| gemv_4bit, |
| linear_4bit, |
| quantize_4bit, |
| ) |
|
|
| |
# 16-entry NF4 codebook: nibble index -> normalized value in [-1, 1].
# Values are presumably the NormalFloat4 quantiles from the QLoRA paper —
# TODO confirm against the bitsandbytes_mps kernel's embedded table.
NF4_CODEBOOK = [
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
    0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
    0.7229568362236023, 1.0,
]


# 16-entry FP4 codebook: indices 0-7 hold the non-negative values and
# indices 8-15 mirror them negated (0.0 appears twice, as +0 and -0).
FP4_CODEBOOK = [
    0.0, 0.005208333333, 0.66666667, 1.0, 0.33333333, 0.5, 0.16666667, 0.25,
    0.0, -0.005208333333, -0.66666667, -1.0, -0.33333333, -0.5, -0.16666667,
    -0.25,
]


# All tensors in this module are allocated on Apple's Metal (MPS) backend.
# NOTE(review): there is no skipif guard for MPS availability — on machines
# without MPS these tests will error rather than skip; consider a module-level
# pytestmark.
DEVICE = "mps"
|
|
|
|
def _reference_quantize_nf4(x_flat, blocksize):
    """Reference Python implementation of NF4 blockwise quantization.

    Returns ``(packed, absmax)``: ``packed`` stores two 4-bit codebook
    indices per uint8 byte (even-positioned element in the high nibble,
    odd-positioned element in the low nibble), and ``absmax`` stores one
    float32 scale per block of ``blocksize`` elements.
    """
    total = x_flat.numel()
    block_count = -(-total // blocksize)  # ceiling division
    absmax = torch.zeros(block_count, dtype=torch.float32)
    packed = torch.zeros((total + 1) // 2, dtype=torch.uint8)
    codes = torch.tensor(NF4_CODEBOOK, dtype=torch.float32)

    for blk in range(block_count):
        lo = blk * blocksize
        hi = min(lo + blocksize, total)
        chunk = x_flat[lo:hi].float()
        scale = chunk.abs().max().item()
        absmax[blk] = scale

        # Normalize into [-1, 1]; an all-zero block has no scale to divide by.
        if scale > 0:
            scaled = (chunk / scale).clamp(-1, 1)
        else:
            scaled = torch.zeros_like(chunk)

        pos = 0
        count = hi - lo
        while pos < count:
            # Nearest-codebook-entry lookup for each element of the pair.
            high = (codes - scaled[pos].item()).abs().argmin().item()
            low = 0
            if pos + 1 < count:
                low = (codes - scaled[pos + 1].item()).abs().argmin().item()

            # blocksize is even in practice, so lo + pos lands on byte edges.
            packed[(lo + pos) // 2] = (high << 4) | (low & 0x0F)
            pos += 2

    return packed, absmax
|
|
|
|
def _reference_dequantize_nf4(packed, absmax, blocksize, numel):
    """Reference Python implementation of NF4 blockwise dequantization.

    Inverse of ``_reference_quantize_nf4``: each 4-bit index (high nibble
    first within a byte) is looked up in the NF4 codebook and rescaled by
    its block's absmax. Returns a float32 tensor of ``numel`` elements.
    """
    codes = torch.tensor(NF4_CODEBOOK, dtype=torch.float32)
    result = torch.zeros(numel, dtype=torch.float32)

    for idx in range(numel):
        byte_val = packed[idx // 2].item()
        # Even positions were packed into the high nibble, odd into the low.
        if idx % 2 == 0:
            code = (byte_val >> 4) & 0x0F
        else:
            code = byte_val & 0x0F
        result[idx] = codes[code] * absmax[idx // blocksize].item()

    return result
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_quantize_dequantize_roundtrip(blocksize, quant_type, dtype):
    """Test that quantize -> dequantize approximately recovers the original."""
    torch.manual_seed(42)
    numel = 1024
    original = torch.randn(numel, dtype=dtype, device=DEVICE)

    packed, absmax = quantize_4bit(
        original, blocksize=blocksize, quant_type=quant_type
    )

    # Two 4-bit values per byte; one float32 scale per block.
    assert packed.shape == (numel // 2,)
    assert packed.dtype == torch.uint8
    assert absmax.dtype == torch.float32
    assert absmax.shape == ((numel + blocksize - 1) // blocksize,)

    recovered = dequantize_4bit(
        packed,
        absmax,
        blocksize=blocksize,
        quant_type=quant_type,
        numel=numel,
        output_dtype=dtype,
    )

    assert recovered.shape == (numel,)
    assert recovered.dtype == dtype

    # 4-bit quantization is lossy, so compare direction rather than values;
    # do the comparison in float32 on CPU.
    similarity = torch.nn.functional.cosine_similarity(
        original.float().cpu().unsqueeze(0),
        recovered.float().cpu().unsqueeze(0),
    ).item()
    assert similarity > 0.95, f"Cosine similarity too low: {similarity}"
|
|
|
|
@pytest.mark.parametrize("blocksize", [64, 128])
def test_dequantize_matches_reference(blocksize):
    """Test dequantization matches the Python reference implementation."""
    torch.manual_seed(123)
    numel = 256
    data = torch.randn(numel, dtype=torch.float16, device=DEVICE)

    packed, absmax = quantize_4bit(data, blocksize=blocksize, quant_type=NF4)

    # Kernel output on the MPS device.
    kernel_out = dequantize_4bit(
        packed,
        absmax,
        blocksize=blocksize,
        quant_type=NF4,
        numel=numel,
        output_dtype=torch.float16,
    )

    # Pure-Python reference run on CPU copies of the same packed data.
    reference = _reference_dequantize_nf4(
        packed.cpu(), absmax.cpu(), blocksize, numel
    )

    torch.testing.assert_close(
        kernel_out.float().cpu(), reference, rtol=1e-3, atol=1e-3
    )
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
def test_gemv_correctness(blocksize, quant_type):
    """Test fused GEMV against dequantize + matmul reference."""
    torch.manual_seed(42)
    out_features, in_features = 256, 256

    # Quantize a random weight matrix as one flat row-major buffer.
    weight = torch.randn(
        out_features, in_features, dtype=torch.float16, device=DEVICE
    )
    packed, absmax = quantize_4bit(
        weight.flatten(), blocksize=blocksize, quant_type=quant_type
    )

    # View the packed/scale buffers with one row per output feature.
    packed_w = packed.view(out_features, in_features // 2)
    absmax_w = absmax.view(out_features, -1)

    vec = torch.randn(in_features, dtype=torch.float16, device=DEVICE)

    # Fused kernel under test.
    fused = gemv_4bit(
        vec,
        packed_w,
        absmax_w,
        output_features=out_features,
        blocksize=blocksize,
        quant_type=quant_type,
    )

    # Reference path: dequantize the whole matrix, then a dense matvec.
    dense = dequantize_4bit(
        packed,
        absmax,
        blocksize=blocksize,
        quant_type=quant_type,
        numel=out_features * in_features,
        output_dtype=torch.float16,
    ).view(out_features, in_features)
    expected = dense @ vec

    # Mean relative error tolerates fp16 accumulation differences.
    rel_error = (
        (fused.float() - expected.float()).abs().mean()
        / expected.float().abs().mean()
    )
    assert rel_error < 0.05, f"GEMV relative error too high: {rel_error}"
|
|
|
|
| |
| |
| |
|
|
|
|
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("quant_type", [NF4, FP4])
def test_gemm_correctness(blocksize, quant_type):
    """Test fused GEMM against dequantize + matmul reference."""
    torch.manual_seed(42)
    batch, out_features, in_features = 8, 128, 128

    weight = torch.randn(
        out_features, in_features, dtype=torch.float16, device=DEVICE
    )
    packed, absmax = quantize_4bit(
        weight.flatten(), blocksize=blocksize, quant_type=quant_type
    )

    # One packed row / scale row per output feature.
    packed_w = packed.view(out_features, in_features // 2)
    absmax_w = absmax.view(out_features, -1)

    activations = torch.randn(
        batch, in_features, dtype=torch.float16, device=DEVICE
    )

    # Fused kernel under test.
    fused = gemm_4bit(
        activations,
        packed_w,
        absmax_w,
        output_features=out_features,
        blocksize=blocksize,
        quant_type=quant_type,
    )

    # Reference path: dequantize, then dense X @ W^T.
    dense = dequantize_4bit(
        packed,
        absmax,
        blocksize=blocksize,
        quant_type=quant_type,
        numel=out_features * in_features,
        output_dtype=torch.float16,
    ).view(out_features, in_features)
    expected = activations @ dense.T

    rel_error = (
        (fused.float() - expected.float()).abs().mean()
        / expected.float().abs().mean()
    )
    assert rel_error < 0.05, f"GEMM relative error too high: {rel_error}"
|
|
|
|
| |
| |
| |
|
|
|
|
def test_linear_4bit_auto_select():
    """Test that linear_4bit auto-selects GEMV vs GEMM."""
    torch.manual_seed(42)
    out_features, in_features = 128, 128

    weight = torch.randn(
        out_features, in_features, dtype=torch.float16, device=DEVICE
    )
    packed, absmax = quantize_4bit(weight.flatten(), blocksize=64, quant_type=NF4)
    packed_w = packed.view(out_features, in_features // 2)
    absmax_w = absmax.view(out_features, -1)

    # 1-D input: expected to take the GEMV path and return a vector.
    vec = torch.randn(in_features, dtype=torch.float16, device=DEVICE)
    out_vec = linear_4bit(vec, packed_w, absmax_w, output_features=out_features)
    assert out_vec.shape == (out_features,)

    # 2-D batched input: expected to take the GEMM path and return a matrix.
    batch = torch.randn(4, in_features, dtype=torch.float16, device=DEVICE)
    out_mat = linear_4bit(batch, packed_w, absmax_w, output_features=out_features)
    assert out_mat.shape == (4, out_features)
|
|
|
|
# Allow running this file directly (python this_file.py) via pytest's CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
|