Spaces:

noblebarkrr
/

mvsepless_zero_gpu

Paused

App Files Files Community

mvsepless_zero_gpu / models /windowed_roformer /modules.py

noblebarkrr

Updated to Dzeta

4f175c5 about 1 month ago

raw

history blame contribute delete

12.5 kB

	from functools import partial

	import torch
	from torch import nn
	from torch.nn import Module, ModuleList
	import torch.nn.functional as F

	from einops import rearrange, pack, unpack
	from typing import Tuple
	from functools import wraps
	from packaging import version
	from collections import namedtuple
	import os

	# PyTorch version check
	TORCH_VERSION = tuple(map(int, torch.__version__.split('.')[:2]))
	IS_TORCH_LT_2_5 = TORCH_VERSION < (2, 5)
	IS_TORCH_LT_2_0 = TORCH_VERSION < (2, 0)

	# Conditional import for flex attention
	if not IS_TORCH_LT_2_5:
	from .flex_attention_utils import (
	FlexAttention,
	generate_sliding_window_with_sinks,
	)
	else:
	FlexAttention = None
	generate_sliding_window_with_sinks = None


	def pack_one(t, pattern):
	return pack([t], pattern)


	def unpack_one(t, ps, pattern):
	return unpack(t, ps, pattern)[0]


	class RMSNorm(Module):
	def __init__(self, dim):
	super().__init__()
	self.scale = dim ** 0.5
	self.gamma = nn.Parameter(torch.ones(dim))

	def forward(self, x):
	return F.normalize(x, dim=-1) * self.scale * self.gamma


	class FeedForward(Module):
	def __init__(self, dim, mult=4, dropout=0.0):
	super().__init__()
	dim_inner = int(dim * mult)
	self.net = nn.Sequential(
	RMSNorm(dim),
	nn.Linear(dim, dim_inner),
	nn.GELU(),
	nn.Dropout(dropout),
	nn.Linear(dim_inner, dim),
	nn.Dropout(dropout),
	)

	def forward(self, x):
	return self.net(x)


	# Manual SDPA for PyTorch < 2.0
	def manual_scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=False, scale=None, dropout_p=0.0):
	"""SDPA implementation for PyTorch < 2.0"""
	if scale is None:
	scale = q.shape[-1] ** -0.5

	attn_weights = torch.matmul(q, k.transpose(-2, -1)) * scale

	if is_causal:
	seq_len = attn_weights.shape[-1]
	causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn_weights.device), diagonal=1).bool()
	attn_weights = attn_weights.masked_fill(causal_mask, float('-inf'))

	if attn_mask is not None:
	if attn_mask.dtype == torch.bool:
	attn_weights = attn_weights.masked_fill(~attn_mask, float('-inf'))
	else:
	attn_weights = attn_weights + attn_mask

	attn_weights = torch.softmax(attn_weights, dim=-1)

	if dropout_p > 0.0 and q.requires_grad:
	attn_weights = torch.dropout(attn_weights, dropout_p, train=True)

	return torch.matmul(attn_weights, v)


	class Attention(Module):
	def __init__(
	self,
	dim,
	heads=8,
	dim_head=64,
	dropout=0.0,
	rotary_embed=None,
	flash=True,
	wsa_window_len=None,
	n_wsa_sinks=None,
	):
	super().__init__()
	self.heads = heads
	self.scale = dim_head ** -0.5
	dim_inner = heads * dim_head

	self.rotary_embed = rotary_embed

	self.attend = Attend(
	flash=flash and not IS_TORCH_LT_2_0, # Disable flash for old PyTorch
	dropout=dropout,
	wsa_window_len=wsa_window_len,
	n_wsa_sinks=n_wsa_sinks,
	)
	self.norm = RMSNorm(dim)
	self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)

	self.to_gates = nn.Linear(dim, heads)

	self.to_out = nn.Sequential(
	nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
	)

	def forward(self, x, return_attn=False):
	x = self.norm(x)

	q, k, v = rearrange(
	self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
	)

	if self.rotary_embed is not None:
	q = self.rotary_embed.rotate_queries_or_keys(q)
	k = self.rotary_embed.rotate_queries_or_keys(k)

	if return_attn:
	out, attn = self.attend(q, k, v, return_attn=True)
	else:
	out = self.attend(q, k, v)

	gates = self.to_gates(x)
	out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()

	out = rearrange(out, "b h n d -> b n (h d)")
	result = self.to_out(out)

	if return_attn:
	return result, attn
	return result


	class Transformer(Module):
	def __init__(
	self,
	*,
	dim,
	depth: int = 1,
	dim_head=64,
	heads=8,
	attn_dropout=0.0,
	ff_dropout=0.0,
	ff_mult=4,
	norm_output=True,
	rotary_embed=None,
	use_flash=True,
	wsa_window_len=None,
	n_wsa_sinks=None,
	):
	super().__init__()
	self.layers = ModuleList([])

	for _ in range(depth):
	attn = Attention(
	dim=dim,
	dim_head=dim_head,
	heads=heads,
	dropout=attn_dropout,
	rotary_embed=rotary_embed,
	flash=use_flash,
	wsa_window_len=wsa_window_len,
	n_wsa_sinks=n_wsa_sinks,
	)

	self.layers.append(
	ModuleList(
	[attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
	)
	)

	self.norm = RMSNorm(dim) if norm_output else nn.Identity()

	def forward(self, x):
	for attn, ff in self.layers:
	x = attn(x) + x
	x = ff(x) + x
	return self.norm(x)


	class BandSplit(Module):
	def __init__(self, dim: int, dim_inputs: Tuple[int, ...]):
	super().__init__()
	self.dim_inputs = dim_inputs
	self.to_features = ModuleList([])

	for dim_in in dim_inputs:
	net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))
	self.to_features.append(net)

	def forward(self, x):
	x = x.split(self.dim_inputs, dim=-1)

	outs = []
	for split_input, to_feature in zip(x, self.to_features):
	split_output = to_feature(split_input)
	outs.append(split_output)

	return torch.stack(outs, dim=-2)


	def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
	dim_hidden = dim_hidden if dim_hidden is not None else dim_in

	net = []
	dims = (dim_in, ((dim_hidden,) depth), dim_out)

	for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
	is_last = ind == (len(dims) - 2)

	net.append(nn.Linear(layer_dim_in, layer_dim_out))

	if is_last:
	continue

	net.append(activation())

	return nn.Sequential(*net)


	class MaskEstimator(Module):
	def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
	super().__init__()
	self.dim_inputs = dim_inputs
	self.to_freqs = ModuleList([])
	dim_hidden = dim * mlp_expansion_factor

	for dim_in in dim_inputs:
	mlp = nn.Sequential(
	MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
	)
	self.to_freqs.append(mlp)

	def forward(self, x):
	x = x.unbind(dim=-2)

	outs = []

	for band_features, mlp in zip(x, self.to_freqs):
	freq_out = mlp(band_features)
	outs.append(freq_out)

	return torch.cat(outs, dim=-1)


	FlashAttentionConfig = namedtuple(
	"FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"]
	)


	def once(fn):
	called = False

	@wraps(fn)
	def inner(x):
	nonlocal called
	if called:
	return
	called = True
	return fn(x)

	return inner


	print_once = once(print)


	class Attend(nn.Module):
	def __init__(
	self,
	dropout=0.0,
	flash=False,
	scale=None,
	wsa_window_len=None,
	n_wsa_sinks=None,
	):
	super().__init__()
	self.scale = scale
	self.dropout = dropout
	self.attn_dropout = nn.Dropout(dropout)
	self.wsa_window_len = wsa_window_len
	self.n_wsa_sinks = n_wsa_sinks
	self.use_flash = flash

	# Flex attention only for PyTorch >= 2.5
	if wsa_window_len is not None and n_wsa_sinks is not None and n_wsa_sinks > 0:
	if IS_TORCH_LT_2_5:
	print_once(
	f"Warning: WSA (windowed sliding attention) requires PyTorch >= 2.5.0, got {torch.__version__}. "
	"Disabling WSA and falling back to standard attention."
	)
	self.flex_attn = None
	self.wsa_window_len = None
	self.n_wsa_sinks = None
	else:
	assert not (
	version.parse(torch.__version__) < version.parse("2.5.0")
	), "in order to use flex attention, you must be using pytorch 2.5 or above"
	mask_mod = generate_sliding_window_with_sinks(wsa_window_len, n_wsa_sinks)

	self.flex_attn = FlexAttention(
	mask_mod=mask_mod,
	dropout=dropout,
	scale=scale,
	compile=True,
	)
	else:
	self.flex_attn = None

	# Flash attention warning for old PyTorch
	if self.use_flash and IS_TORCH_LT_2_0:
	print_once(
	f"Warning: Flash attention requires PyTorch >= 2.0.0, got {torch.__version__}. "
	"Falling back to standard attention."
	)
	self.use_flash = False

	self.cpu_config = FlashAttentionConfig(True, True, True)
	self.cuda_config = None

	if not torch.cuda.is_available() or not self.use_flash:
	return

	device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
	device_version = version.parse(
	f"{device_properties.major}.{device_properties.minor}"
	)

	if device_version >= version.parse("8.0"):
	if os.name == "nt":
	print_once(
	"Windows OS detected, using math or mem efficient attention if input tensor is on cuda"
	)
	self.cuda_config = FlashAttentionConfig(False, True, True)
	else:
	print_once(
	"GPU Compute Capability equal or above 8.0, using flash attention if input tensor is on cuda"
	)
	self.cuda_config = FlashAttentionConfig(True, False, False)
	else:
	print_once(
	"GPU Compute Capability below 8.0, using math or mem efficient attention if input tensor is on cuda"
	)
	self.cuda_config = FlashAttentionConfig(False, True, True)

	def flash_attn(self, q, k, v):
	_, heads, q_len, _, k_len, is_cuda, device = (
	*q.shape,
	k.shape[-2],
	q.is_cuda,
	q.device,
	)

	if self.scale is not None:
	default_scale = q.shape[-1] ** -0.5
	q = q * (self.scale / default_scale)

	config = self.cuda_config if is_cuda else self.cpu_config

	# For PyTorch < 2.0, use manual attention
	if IS_TORCH_LT_2_0:
	return manual_scaled_dot_product_attention(
	q, k, v,
	dropout_p=self.dropout if self.training else 0.0
	)

	with torch.backends.cuda.sdp_kernel(**config._asdict()):
	out = F.scaled_dot_product_attention(
	q, k, v, dropout_p=self.dropout if self.training else 0.0
	)

	return out

	def forward(self, q, k, v, return_attn=False):
	# Flex attention path
	if self.flex_attn is not None:
	return self.flex_attn(q, k, v)

	# Flash attention path (PyTorch >= 2.0)
	if self.use_flash and not IS_TORCH_LT_2_0:
	return self.flash_attn(q, k, v)

	# Manual attention path (PyTorch < 2.0 or fallback)
	q_len, k_len, device = q.shape[-2], k.shape[-2], q.device
	scale = self.scale if self.scale is not None else q.shape[-1] ** -0.5

	sim = torch.einsum(f"b h i d, b h j d -> b h i j", q, k) * scale

	attn = sim.softmax(dim=-1)
	attn = self.attn_dropout(attn)

	out = torch.einsum(f"b h i j, b h j d -> b h i d", attn, v)

	if return_attn:
	return out, attn
	return out