Spaces:

FlexTheAi
/

Flexstorydiff

Runtime error

App Files Files Community

Flexstorydiff / xformers /tests /test_tiled_matmul.py

FlexTheAi

Upload folder using huggingface_hub

e202b16 verified over 1 year ago

raw

history blame contribute delete

5.51 kB

	# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
	#
	# This source code is licensed under the BSD license found in the
	# LICENSE file in the root directory of this source tree.

	import random

	import pytest
	import torch

	from xformers import _is_triton_available
	from xformers.ops.tiled_matmul import tiled_matmul

	# Skip marker for tests that cannot run on a machine without a CUDA device.
	cuda_only = pytest.mark.skipif(
	    not torch.cuda.is_available(),
	    reason="requires CUDA",
	)

	# (major, minor) capability of the "cuda" device; (0, 0) when there is no
	# CUDA device, which makes the capability-based marker below skip.
	compute_capability = (
	    torch.cuda.get_device_capability("cuda") if torch.cuda.is_available() else (0, 0)
	)

	# Skip marker for tests that need a device of compute capability 7.0 or newer.
	cuda_sm70_only = pytest.mark.skipif(
	    compute_capability < (7, 0),
	    reason="requires sm70+",
	)

	# These tests check correctness rather than speed, so the expensive
	# autotuning is effectively switched off by keeping only the first config.
	if _is_triton_available():
	    from xformers.ops._triton.tiled_matmul_kernels import _xformers_tiled_matmul_kernel

	    # Trim in place so the kernel keeps the very same list object.
	    del _xformers_tiled_matmul_kernel.configs[1:]


	def generate_test_shapes(*repeats, num_shapes=5):
	    """Return reproducible random tile-size triples for the given tilings.

	    Each positional argument is a ``(m_num_tiles, n_num_tiles, k_num_tiles)``
	    triple. For every triple, ``num_shapes`` entries are produced; an entry is
	    a tuple of three lists holding the sizes of the tiles along M, N and K.
	    Every size is drawn from ``[2, 1024 // num_tiles]`` by a generator seeded
	    with 0, so repeated calls with the same arguments give the same result.
	    """
	    rng = random.Random(0)

	    def draw_tile_sizes(num_tiles):
	        # The upper bound is computed per draw (not hoisted) so that a
	        # dimension with zero tiles simply yields an empty list.
	        return [rng.randint(2, 1024 // num_tiles) for _ in range(num_tiles)]

	    # Draw order matters for reproducibility: M, then N, then K per entry.
	    return [
	        (
	            draw_tile_sizes(m_num_tiles),
	            draw_tile_sizes(n_num_tiles),
	            draw_tile_sizes(k_num_tiles),
	        )
	        for m_num_tiles, n_num_tiles, k_num_tiles in repeats
	        for _ in range(num_shapes)
	    ]


	# Shapes used to parametrize the tests: the default number of random shapes
	# with a single tile per dimension, then the same number with three tiles
	# per dimension.
	_test_shapes = generate_test_shapes((1, 1, 1), (3, 3, 3))
	# Element types used to parametrize the tests.
	_dtypes = [torch.float32, torch.bfloat16, torch.float16]


	def ceil_of_ratio(n, k):
	    """Return ``n / k`` rounded up, computed as ``(n + k - 1) // k``."""
	    rounded_up, _ = divmod(n + k - 1, k)
	    return rounded_up


	def make_operands(m, n, k, *, dtype):
	    """Build an (m, k) lhs, a (k, n) rhs and the exact value of their product.

	    Random values would expose accuracy differences between our kernels and
	    PyTorch's, so the operands are built such that the product is an exact
	    integer computation: a remainder.

	    Before permutation, the i-th row of lhs and the j-th column of rhs are:
	    * lhs: i times "1", followed by "0"
	    * rhs: j-1 times "1", followed by "-(j-1)", repeated down the column
	    The running sum of their pointwise product is therefore
	    1, 2, 3, ..., j-1, 0, 1, 2, 3, ... and its final value is the remainder
	    of i by j (rows are numbered from 1, columns from 2).

	    At most ``max_value`` distinct rows/columns are generated, where
	    ``max_value`` is bounded both by K and by a limit derived from the
	    precision of ``dtype``; when M and/or N exceed it, rows and/or columns
	    are repeated to fill M and/or N.

	    Finally rows and columns are shuffled with ``torch.randperm`` so that the
	    operands have no predictable block structure.

	    Returns the tuple ``(lhs, rhs, out)`` where ``out`` has shape (m, n) and
	    the given dtype.
	    """
	    max_value = min(k, int(1 / torch.finfo(dtype).eps) * 2)
	    # Both permutations are drawn up front, rows first, so that the torch RNG
	    # is always consumed in the same order.
	    m_perm = torch.randperm(m)
	    n_perm = torch.randperm(n)

	    distinct_rows = min(m, max_value)
	    distinct_cols = min(n, max_value)
	    num_reps_m = ceil_of_ratio(m, max_value)
	    num_reps_n = ceil_of_ratio(n, max_value)

	    # Lower-triangular ones: row r (0-based) starts with r + 1 ones.
	    lower_triangle = torch.tril(torch.ones((distinct_rows, k), dtype=dtype))
	    lhs = lower_triangle.repeat(num_reps_m, 1)[m_perm]
	    assert lhs.shape == (m, k)

	    rhs = torch.ones((k, distinct_cols), dtype=dtype)
	    for col in range(distinct_cols):
	        # Column `col` has period col + 2: every period-th entry cancels the
	        # ones accumulated since the previous reset.
	        period = col + 2
	        rhs[period - 1 :: period, col] = 1 - period
	    rhs = rhs.repeat(1, num_reps_n)[:, n_perm]
	    assert rhs.shape == (k, n)

	    # Apply the same repetition and shuffling to the row/column numbers and
	    # take the remainder directly to obtain the reference product.
	    row_ids = torch.arange(1, distinct_rows + 1).repeat(num_reps_m)[m_perm]
	    col_ids = torch.arange(2, distinct_cols + 2).repeat(num_reps_n)[n_perm]
	    out = torch.remainder(row_ids.unsqueeze(1), col_ids.unsqueeze(0)).to(dtype)
	    assert out.shape == (m, n)

	    return lhs, rhs, out


	@cuda_only
	@cuda_sm70_only
	@pytest.mark.parametrize("shape", _test_shapes, ids=[str(x) for x in _test_shapes])
	@pytest.mark.parametrize("dtype", _dtypes, ids=[str(x) for x in _dtypes])
	def test_forward_backward(
	    shape,
	    dtype,
	):
	    """Compare tiled_matmul's output and input gradients against references.

	    ``shape`` is a triple of lists with the tile sizes along M, N and K, and
	    ``dtype`` is the element type of the operands. The forward result is
	    checked against the reference returned by ``make_operands``; the
	    gradients of both operands are checked against plain ``torch.matmul``
	    applied to a diagonal upstream gradient.
	    """
	    m_tiles, n_tiles, k_tiles = shape
	    m, n, k = sum(m_tiles), sum(n_tiles), sum(k_tiles)

	    torch.manual_seed(m * n * k)
	    # torch.manual_seed does not seed Python's `random` module, so draw the
	    # upstream gradient from a dedicated generator seeded the same way; this
	    # keeps every parametrization reproducible from run to run.
	    rng = random.Random(m * n * k)

	    a, b, c_reference = make_operands(m, n, k, dtype=dtype)
	    a = a.cuda().requires_grad_()
	    b = b.cuda().requires_grad_()
	    c_reference = c_reference.cuda()

	    # In one operand make each tile have its own strides, in the other use the
	    # same stride for all tiles. And make the two operands have the stride==1
	    # in different dimensions.
	    a_tiled = [
	        [y.t().clone().t() for y in x.split(k_tiles, dim=1)]
	        for x in a.split(m_tiles, dim=0)
	    ]
	    b_tiled = [[y for y in x.split(n_tiles, dim=1)] for x in b.split(k_tiles, dim=0)]

	    c_test_tiled = tiled_matmul(a_tiled, b_tiled)
	    c_test = torch.cat([torch.cat(x, dim=1) for x in c_test_tiled], dim=0)

	    torch.testing.assert_close(c_test, c_reference)

	    # To avoid numerical issues in the backward, set things up so that we only
	    # multiply by a diagonal matrix whose entries are +/- 2^{-1/0/+1} (so that
	    # it only changes the sign bit and the exponent).
	    diag = torch.tensor(rng.choices([-2, -1, -0.5, 0.5, 1, 2], k=min(m, n)))
	    grad_c = torch.zeros_like(c_test)
	    # Write through torch.diagonal, which returns a view of grad_c. Using
	    # torch.diag here would fill a copy and leave grad_c all zeros, making the
	    # gradient checks below compare zeros with zeros.
	    torch.diagonal(grad_c).copy_(diag)
	    grad_a_reference = torch.matmul(grad_c, b.detach().t())
	    grad_b_reference = torch.matmul(a.detach().t(), grad_c)

	    torch.autograd.backward([c_test], [grad_c], inputs=[a, b])

	    torch.testing.assert_close(a.grad, grad_a_reference)
	    torch.testing.assert_close(b.grad, grad_b_reference)