# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import math
from typing import Tuple

import pytest
import torch

from xformers.components import (
    InputProjection,
    InputProjectionConfig,
    MultiHeadDispatch,
)

# Automatically test all the registered attentions
from xformers.components.attention import (
    _DENSITY_THRESHOLD,
    ATTENTION_REGISTRY,
    build_attention,
)

DEVICES = (
    [torch.device("cpu")] if not torch.cuda.is_available() else [torch.device("cuda")]
)

BATCH = 2
SEQ = 128 if torch.cuda.is_available() else 36
MODEL = 128 if torch.cuda.is_available() else 16
GLOBAL_ATTENTION_RATIO = (
    _DENSITY_THRESHOLD * 0.9
)  # Make sure that we test the sparse implementation, no matter the threshold

assert ATTENTION_REGISTRY.keys(), "Attention layers should have been registered"

_non_order_invariant_attentions = ["visual", "pooling"]

def _get_multihead(
    attention_name,
    attn_dropout,
    res_dropout,
    causal,
    heads,
    device,
    skip_output_projection=False,
    use_separate_proj_weights=True,
):
    test_config = {
        "name": attention_name,
        "dropout": attn_dropout,
        "causal": causal,
        "seq_len": SEQ,
        "window_size": SEQ // 8 + 1,  # local attention
        "attention_query_mask": torch.rand((SEQ, 1)) < GLOBAL_ATTENTION_RATIO,
        "dim_model": MODEL,
        "num_heads": heads,
        "dim_head": MODEL // heads,  # per-head dimension must be an integer
        "num_rules": 2,  # Compositional Attention
        "r": 0.5,  # random attention, ratio of tokens that the attention can attend to
    }

    if skip_output_projection:

        def noop(x):
            return x

        test_config["out_proj"] = noop

    # Add some blocksparse layout to test the corresponding attention
    block_size = 16
    test_config["layout"] = torch.eye(
        SEQ // block_size, SEQ // block_size, dtype=torch.long
    )
    test_config["block_size"] = block_size

    attention = build_attention(test_config)

    # build a multi head dispatch to test this attention mechanism
    multi_head = MultiHeadDispatch(
        seq_len=SEQ,
        dim_model=MODEL,
        residual_dropout=res_dropout,
        num_heads=heads,
        attention=attention,
        use_separate_proj_weight=use_separate_proj_weights,
    ).to(device)

    return multi_head
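
# The parametrize decorators below are an assumed, representative set (the dropout,
# causal and head values are illustrative): they feed every registered attention that
# is order invariant, on the available devices, into the test signature.
@pytest.mark.parametrize("attn_dropout", [0.0, 0.3])
@pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
@pytest.mark.parametrize("causal", [True, False])
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize(
    "attention_name",
    [a for a in ATTENTION_REGISTRY.keys() if a not in _non_order_invariant_attentions],
)
@pytest.mark.parametrize("device", DEVICES)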
def test_order_invariance(
    attention_name: str,
    heads: int,
    attn_dropout: float,
    residual_dropout: float,
    causal: bool,
    device: torch.device,
):
    if (
        torch.version.hip
        and device == torch.device("cuda")
        and attention_name == "local"
    ):
        # Backend calls into Sputnik library which isn't built on ROCm
        device = torch.device("cpu")

    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    multi_head = _get_multihead(
        attention_name,
        attn_dropout,
        residual_dropout,
        causal,
        heads,
        device,
        use_separate_proj_weights=False,
    )

    if (
        int(math.sqrt(SEQ)) ** 2 != SEQ
        and multi_head.attention.requires_squared_context
    ):
        pytest.skip(f"{attention_name} requires squared sequence lengths")

    # Check that we can pass a smaller sequence
    seqs = (
        [SEQ, SEQ // 2]
        if not multi_head.attention.requires_same_k_q_dimensions
        else [SEQ]
    )

    for seq in seqs:
        # Check that the attention is invariant to a permutation of K, V
        inputs = torch.rand(BATCH, seq, MODEL, device=device)
        shuffle = torch.randperm(inputs.shape[1])
        inputs_shuffled = inputs[:, shuffle, :].clone()

        results = multi_head(inputs, inputs, inputs)
        results_shuffled = multi_head(inputs, inputs_shuffled, inputs_shuffled)
        torch.allclose(results, results_shuffled)

        # Check that the attention is equivariant to a permutation of Q,
        # meaning that the result is permuted in the same way
        results_shuffled = multi_head(inputs_shuffled, inputs, inputs)
        torch.allclose(results[:, shuffle, :], results_shuffled)

        # Check that dropout actually drops some values
        if attn_dropout > 0:
            att_1 = multi_head(inputs, inputs_shuffled, inputs)
            att_2 = multi_head(inputs, inputs_shuffled, inputs)
            assert (att_1 != att_2).any()

    # Test AMP, if available
    if device.type == "cuda":
        with torch.cuda.amp.autocast(enabled=True):
            _ = multi_head(inputs, inputs_shuffled, inputs)
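
# Assumed, representative parametrization (illustrative values, not from the original):
# every registered attention, a couple of head counts, on the available devices.
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("device", DEVICES)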
def test_kqv_ordering(
    attention_name: str,
    heads: int,
    device: torch.device,
):
    multi_head = _get_multihead(attention_name, 0.0, 0.0, False, heads, device)

    # Check that kqv are not flipped.
    # This will not catch all issues, but would catch a V being misplaced:
    # make k and q complementary, so that QKt is all zero and attention is uniform
    q = torch.cat(
        (
            torch.rand((1, MODEL // 2), device=device),
            torch.zeros((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    k = torch.cat(
        (
            torch.zeros((1, MODEL // 2), device=device),
            torch.rand((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    v = torch.rand(BATCH, SEQ, MODEL, device=device)

    # Normal call
    res = multi_head(query=q, key=k, value=v)
    for i in range(BATCH):
        assert torch.allclose(res[i, :, :], res[i, 0, :].unsqueeze(-2))

    assert not torch.allclose(res[0, :, :], res[1, :, :])

    # Flip qkv, and check that we invert the above check properly
    res_false = multi_head(query=v, key=k, value=q)
    assert torch.allclose(res_false[0, :, :], res_false[1, :, :])
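
# Assumed, representative parametrization (illustrative values).
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("device", DEVICES)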
def test_different_seqlen(
    attention_name: str,
    heads: int,
    device: torch.device,
):
    multi_head = _get_multihead(attention_name, 0.0, 0.0, False, heads, device)

    # Check that kqv are not flipped.
    # This will not catch all issues, but would catch a V being misplaced:
    # make k and q complementary, so that QKt is all zero and attention is uniform
    q = torch.cat(
        (
            torch.rand((1, MODEL // 2), device=device),
            torch.zeros((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    k = torch.cat(
        (
            torch.zeros((1, MODEL // 2), device=device),
            torch.rand((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    v = torch.rand(BATCH, SEQ, MODEL, device=device)

    # Normal call
    res = multi_head(query=q, key=k, value=v)

    # Halve the sequence length to simulate a different sequence length
    q2 = torch.cat(
        (
            torch.rand((1, MODEL // 2), device=device),
            torch.zeros((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ // 2, MODEL))
    k2 = torch.cat(
        (
            torch.zeros((1, MODEL // 2), device=device),
            torch.rand((1, MODEL // 2), device=device),
        ),
        dim=1,
    ).expand((BATCH, SEQ // 2, MODEL))
    v2 = torch.rand(BATCH, SEQ // 2, MODEL, device=device)

    res2 = multi_head(query=q2, key=k2, value=v2)

    assert res.shape != res2.shape
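
# Assumed parametrization: the three flags in the signature are simply toggled.
@pytest.mark.parametrize("proj_bias", [False, True])
@pytest.mark.parametrize("same_sizes", [False, True])
@pytest.mark.parametrize("same_settings", [False, True])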
def test_inproj(proj_bias: bool, same_sizes: bool, same_settings: bool):

    test_config = {
        "name": "scaled_dot_product",
        "dropout": 0.1,
        "causal": False,
        "seq_len": SEQ,
        "window_size": SEQ // 8 + 1,
        "num_heads": 1,
        "dim_head": MODEL,
    }

    attention = build_attention(test_config)

    # Construct the initial projection, test different options
    in_params = InputProjectionConfig(MODEL, MODEL, proj_bias)

    if same_settings:
        in_proj = InputProjection(in_params, None, None)
        out_features = MODEL
    else:
        out_features = MODEL if same_sizes else MODEL // 2
        in_params_flip = InputProjectionConfig(MODEL, out_features, proj_bias)
        in_proj = InputProjection(
            in_params_flip,  # Q proj
            in_params_flip,  # K proj
            in_params,  # V proj
        )

    # build a multi head dispatch to test this attention mechanism
    multi_head = MultiHeadDispatch(
        seq_len=SEQ,
        dim_model=MODEL,
        residual_dropout=0.1,
        num_heads=1,
        attention=attention,
        in_proj_container=in_proj,
        dim_key=out_features,
        dim_value=MODEL,
    )

    # Check that kqv are not flipped.
    # This will not catch all issues, but would catch a V being misplaced:
    # make k and q complementary, so that QKt is all zero and attention is uniform
    q = torch.cat(
        (
            torch.rand((1, MODEL // 2)),
            torch.zeros((1, MODEL // 2)),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    k = torch.cat(
        (
            torch.zeros((1, MODEL // 2)),
            torch.rand((1, MODEL // 2)),
        ),
        dim=1,
    ).expand((BATCH, SEQ, MODEL))
    v = torch.rand(BATCH, SEQ, MODEL)

    # just check that a FW does not assert out
    _ = multi_head(query=q, key=k, value=v)
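
# Assumed, representative parametrization (illustrative values).
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("device", DEVICES)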
def test_different_kq_dimensions(
    attention_name: str,
    heads: int,
    device: torch.device,
):
    multi_head = _get_multihead(attention_name, 0.0, 0.0, False, heads, device)

    if multi_head.attention.requires_same_k_q_dimensions:
        # pyre-fixme[29]: The library function `pytest.skip` is not supported by Pyre.
        pytest.skip(f"{attention_name} does not support different k, q dimensions yet.")

    seq_q = SEQ // 2
    q = torch.rand((BATCH, seq_q, MODEL), device=device)
    k = torch.rand((BATCH, SEQ, MODEL), device=device)
    v = torch.rand((BATCH, SEQ, MODEL), device=device)

    res = multi_head(query=q, key=k, value=v)
    assert res.shape == torch.Size([BATCH, seq_q, MODEL])
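
# Assumed parametrization: the batch-size triples are chosen so that the broadcasted
# result keeps BATCH as its batch dimension, matching the assert in the test body.
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize(
    "batch_sizes", [(1, BATCH, BATCH), (BATCH, 1, BATCH), (BATCH, BATCH, 1)]
)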
def test_broadcast_batch_dimension(
    attention_name: str,
    heads: int,
    device: torch.device,
    batch_sizes: Tuple[int, int, int],
):
    Q_BATCH, K_BATCH, V_BATCH = batch_sizes
    multi_head = _get_multihead(attention_name, 0.0, 0.0, False, heads, device)

    if (
        int(math.sqrt(SEQ)) ** 2 != SEQ
        and multi_head.attention.requires_squared_context
    ):
        pytest.skip(f"{attention_name} requires squared sequence lengths")

    if multi_head.attention.requires_same_k_q_dimensions:
        # pyre-fixme[29]: The library function `pytest.skip` is not supported by Pyre.
        pytest.skip(f"{attention_name} does not support different k, q dimensions yet.")

    q = torch.rand((Q_BATCH, SEQ, MODEL), device=device)
    k = torch.rand((K_BATCH, SEQ, MODEL), device=device)
    v = torch.rand((V_BATCH, SEQ, MODEL), device=device)

    res = multi_head(query=q, key=k, value=v)
    assert res.shape == torch.Size([BATCH, SEQ, MODEL])
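
# Assumed decorators: test_causal hard-codes a CUDA device below, so skip when CUDA
# is unavailable; the attention name and head counts are illustrative choices.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a CUDA device")
@pytest.mark.parametrize("heads", [1, 2])
@pytest.mark.parametrize("attention_name", ["scaled_dot_product"])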
def test_causal(
    attention_name: str,
    heads: int,
):
    """
    Make sure that the causal flag is respected.
    The input data is orthogonal by design if causal is respected, but if the
    attention looks ahead this test will fail.
    """

    torch.random.manual_seed(42)
    device = torch.device("cuda")

    multi_head = _get_multihead(
        attention_name,
        0.0,
        0.0,
        causal=True,
        heads=heads,
        device=device,
        skip_output_projection=True,
    )

    k = (
        torch.tril(torch.ones((SEQ, SEQ), device=device), diagonal=0)
        .unsqueeze(0)
        .expand(1, -1, -1)
    )
    q = (
        torch.triu(torch.ones((SEQ, SEQ), device=device), diagonal=0)
        .unsqueeze(0)
        .expand(1, -1, -1)
    )
    v = (
        torch.arange(SEQ, device=device)
        .float()
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(1, -1, SEQ)
    )

    # Make sure that we don't project, to keep the embeddings orthogonal
    multi_head.attention.requires_input_projection = False

    res = multi_head(query=q, key=k, value=v).squeeze(0)

    # Consolidate along the embedding; if causal was respected, the amplitudes
    # should already be sorted
    res_sum = torch.sum(res, dim=1).cpu()

    assert torch.allclose(torch.sort(res_sum)[1], torch.arange(SEQ)) or torch.allclose(
        torch.sort(res_sum, descending=True)[1], torch.arange(SEQ)
    ), res_sum
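
# Assumed, representative parametrization: the test body itself skips the attention
# mechanisms that cannot be scripted yet, so the full registry is passed through.
@pytest.mark.parametrize("attn_dropout", [0.0, 0.1])
@pytest.mark.parametrize("heads", [1, 4])
@pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())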
def test_torch_script_ability(
    attention_name: str,
    heads: int,
    attn_dropout: float,
):
    if attention_name in {"favor", "global", "local", "random"}:
        # pyre-fixme[29]: The library function `pytest.skip` is not supported by Pyre.
        pytest.skip(f"{attention_name} does not support scripting yet.")

    device = torch.device("cpu")

    multi_head = _get_multihead(attention_name, attn_dropout, 0.0, False, heads, device)

    if (
        int(math.sqrt(SEQ)) ** 2 != SEQ
        and multi_head.attention.requires_squared_context
    ):
        pytest.skip(f"{attention_name} requires squared sequence lengths")

    # input for tracing the function
    q = torch.rand((BATCH, SEQ, MODEL), device=device)
    k = torch.rand((BATCH, SEQ, MODEL), device=device)
    v = torch.rand((BATCH, SEQ, MODEL), device=device)

    # to make sure dropout behaves deterministically
    torch.random.manual_seed(42)
    # tracing the attention module
    traced_multi_head = torch.jit.trace(multi_head, (q, k, v))

    # create new random inputs for testing the eager model and traced model
    q = torch.rand((BATCH, SEQ, MODEL), device=device)
    k = torch.rand((BATCH, SEQ, MODEL), device=device)
    v = torch.rand((BATCH, SEQ, MODEL), device=device)

    # to make sure dropout behaves deterministically, we need to set the seed again
    torch.random.manual_seed(42)
    res = multi_head(query=q, key=k, value=v)

    # to make sure dropout behaves deterministically, we need to set the seed again
    torch.random.manual_seed(42)
    res_traced = traced_multi_head(query=q, key=k, value=v)

    assert torch.allclose(res, res_traced)


# TODO: way more unit tests..