vidfom
/

ltx2

Model card Files Files and versions

ltx2 / Wan2GP /models /longcat /modules /block_sparse_attention /common.py

vidfom's picture

Upload folder using huggingface_hub

31112ad verified about 2 months ago

history blame contribute delete

3.29 kB

	import triton
	import triton.language as tl
	import os

	if os.environ.get('TRITON_AUTOTUNE_ENBALE', '0') == '1':
	autotune = triton.autotune
	else:
	def autotune(args, *kwargs):
	def decorator(func):
	return func
	return decorator

	configs_gating_preset = {
	'default': {
	'BLOCK_M': 64,
	'BLOCK_N': 64,
	'num_stages': 3,
	'num_warps': 8,
	}
	}

	configs_gating = [
	triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN}, num_stages=s, num_warps=w) \
	for BM in [64, 128] \
	for BN in [32, 64] \
	for s in [2, 3, 4, 5] \
	for w in [4, 8] \
	]

	gating_reevaluate_keys = ["M", "N"] if os.environ.get('TRITON_REEVALUATE_KEY', '0') == '1' else []
	@autotune(configs_gating, key=gating_reevaluate_keys)
	@triton.jit
	def _attn_fwd_gating(
	Q, K, Out,
	stride_qz, stride_qh, stride_qm, stride_qk,
	stride_kz, stride_kh, stride_kn, stride_kk,
	stride_oz, stride_oh, stride_om, stride_on,
	H, M, N,
	HEAD_DIM: tl.constexpr,
	BLOCK_M: tl.constexpr,
	BLOCK_N: tl.constexpr,
	):

	tl.static_assert(BLOCK_N <= HEAD_DIM)
	start_m = tl.program_id(0)
	off_hz = tl.program_id(1)
	off_z = off_hz // H
	off_h = off_hz % H
	q_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
	k_offset = off_z.to(tl.int64) * stride_kz + off_h.to(tl.int64) * stride_kh
	o_offset = off_z.to(tl.int64) * stride_oz + off_h.to(tl.int64) * stride_oh

	# block pointers
	Q_block_ptr = tl.make_block_ptr(
	base=Q + q_offset,
	shape=(M, HEAD_DIM),
	strides=(stride_qm, stride_qk),
	offsets=(start_m * BLOCK_M, 0),
	block_shape=(BLOCK_M, HEAD_DIM),
	order=(1, 0),
	)

	K_block_ptr = tl.make_block_ptr(
	base=K + k_offset,
	shape=(HEAD_DIM, N),
	strides=(stride_kk, stride_kn),
	offsets=(0, 0),
	block_shape=(HEAD_DIM, BLOCK_N),
	order=(0, 1),
	)
	O_block_ptr = tl.make_block_ptr(
	base=Out + o_offset,
	shape=(M, N),
	strides=(stride_om, stride_on),
	offsets=(start_m * BLOCK_M, 0),
	block_shape=(BLOCK_M, BLOCK_N),
	order=(1, 0),
	)

	# load q: it will stay in SRAM throughout
	q = tl.load(Q_block_ptr, boundary_check=(0,))
	for start_n in range(0, N, BLOCK_N):
	start_n = tl.multiple_of(start_n, BLOCK_N)
	# -- compute qk ----
	k = tl.load(K_block_ptr, boundary_check=(1,))
	qk = tl.dot(q, k)

	tl.store(O_block_ptr, qk.to(Out.type.element_ty), boundary_check=(0, 1))

	K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
	O_block_ptr = tl.advance(O_block_ptr, (0, BLOCK_N))


	@triton.jit
	def _attn_bwd_preprocess(
	O, DO,
	Delta, # output
	N_CTX,
	BLOCK_M: tl.constexpr,
	HEAD_DIM: tl.constexpr
	):

	off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
	off_hz = tl.program_id(1)
	off_n = tl.arange(0, HEAD_DIM)
	# load
	o = tl.load(O + off_hz * HEAD_DIM * N_CTX + off_m[:, None] * HEAD_DIM + off_n[None, :])
	do = tl.load(DO + off_hz * HEAD_DIM * N_CTX + off_m[:, None] * HEAD_DIM + off_n[None, :]).to(tl.float32)
	delta = tl.sum(o * do, axis=1)
	# write-back
	tl.store(Delta + off_hz * N_CTX + off_m, delta)