Spaces:

ZDisket
/

MusicLSTMDemo

Sleeping

ZDisket

add1

827b824 8 months ago

8.8 kB

	import torch
	import torch.nn as nn
	from torch import Tensor, int32
	from torch.amp import autocast
	import einx
	from einops import rearrange, pack, unpack
	from functools import wraps, partial
	from contextlib import nullcontext
	from typing import List, Tuple
	import random

	# helper functions

	def exists(v):
	    """Return True for every value except ``None`` (falsy values such as 0 or '' count as existing)."""
	    return not (v is None)

	def default(*args):
	    """Return the first argument that is not ``None``.

	    Falsy-but-present values (0, False, '') are returned as-is. When every
	    argument is ``None``, or no arguments are given, the result is ``None``.
	    """
	    return next((candidate for candidate in args if candidate is not None), None)

	def maybe(fn):
	    """Wrap ``fn`` so that it is skipped when its first argument is ``None``.

	    The returned callable forwards every positional and keyword argument to
	    ``fn`` unchanged; if the first argument is ``None`` it returns ``None``
	    without calling ``fn``.

	    Args:
	        fn: callable whose first positional argument may be ``None``.

	    Returns:
	        A wrapper carrying ``fn``'s metadata (via ``functools.wraps``).
	    """
	    @wraps(fn)
	    def inner(x, *args, **kwargs):
	        # Variadic forwarding: the wrapper must accept exactly what ``fn``
	        # accepts, including zero extra arguments and keyword arguments.
	        if x is None:
	            return x
	        return fn(x, *args, **kwargs)
	    return inner

	def pack_one(t, pattern):
	    """Pack a single tensor with ``pack`` according to ``pattern``.

	    Returns the two values produced by ``pack`` for the one-element list
	    ``[t]``: the packed tensor and the packed-shape info that
	    ``unpack_one`` needs to undo the operation.
	    """
	    packed, packed_shapes = pack([t], pattern)
	    return packed, packed_shapes

	def unpack_one(t, ps, pattern):
	    """Undo ``pack_one``: unpack ``t`` with shape info ``ps`` and return the first (only) result."""
	    restored = unpack(t, ps, pattern)
	    return restored[0]

	# tensor helpers

	def round_ste(z):
	    """Round ``z`` to the nearest integer with a straight-through gradient.

	    The forward value is ``z.round()``; the rounding residual is detached,
	    so the backward pass sees the identity function.
	    """
	    residual = (torch.round(z) - z).detach()
	    return z + residual

	def floor_ste(z):
	    """Floor ``z`` with a straight-through gradient.

	    The forward value is ``z.floor()``; the flooring residual is detached,
	    so the backward pass sees the identity function.
	    """
	    residual = (torch.floor(z) - z).detach()
	    return z + residual

	# main class

	class FSQ(nn.Module):
	def __init__(
	self,
	levels: List[int],
	dim: int \| None = None,
	num_codebooks = 1,
	keep_num_codebooks_dim: bool \| None = None,
	scale: float \| None = None,
	allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
	channel_first: bool = False,
	projection_has_bias: bool = True,
	return_indices = True,
	force_quantization_f32 = True,
	preserve_symmetry: bool = False,
	noise_dropout = 0.0,
	):
	super().__init__()

	_levels = torch.tensor(levels, dtype=int32)
	self.register_buffer("_levels", _levels, persistent = False)

	_basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
	self.register_buffer("_basis", _basis, persistent = False)

	self.scale = scale

	self.preserve_symmetry = preserve_symmetry
	self.noise_dropout = noise_dropout

	codebook_dim = len(levels)
	self.codebook_dim = codebook_dim

	effective_codebook_dim = codebook_dim * num_codebooks
	self.num_codebooks = num_codebooks
	self.effective_codebook_dim = effective_codebook_dim

	keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
	assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
	self.keep_num_codebooks_dim = keep_num_codebooks_dim

	self.dim = default(dim, len(_levels) * num_codebooks)

	self.channel_first = channel_first

	has_projections = self.dim != effective_codebook_dim
	self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = projection_has_bias) if has_projections else nn.Identity()
	self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = projection_has_bias) if has_projections else nn.Identity()

	self.has_projections = has_projections

	self.return_indices = return_indices
	if return_indices:
	self.codebook_size = self._levels.prod().item()
	implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
	self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)

	self.allowed_dtypes = allowed_dtypes
	self.force_quantization_f32 = force_quantization_f32

	def bound(self, z, eps: float = 1e-3):
	""" Bound `z`, an array of shape (..., d). """
	half_l = (self._levels - 1) * (1 + eps) / 2
	offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
	shift = (offset / half_l).atanh()
	return (z + shift).tanh() * half_l - offset

	# symmetry-preserving and noise-approximated quantization, section 3.2 in https://arxiv.org/abs/2411.19842

	def symmetry_preserving_bound(self, z):
	"""
	QL(x) = 2 / (L - 1) * [(L - 1) * (tanh(x) + 1) / 2 + 0.5] - 1
	"""
	levels_minus_1 = (self._levels - 1)
	scale = 2.0 / levels_minus_1
	bracket = (levels_minus_1 * (torch.tanh(z) + 1) / 2.0) + 0.5
	bracket = floor_ste(bracket)
	return scale * bracket - 1.0

	def quantize(self, z):
	""" Quantizes z, returns quantized zhat, same shape as z. """

	preserve_symmetry = self.preserve_symmetry
	half_width = self._levels // 2

	if preserve_symmetry:
	quantized = round_ste(self.symmetry_preserving_bound(z)) / half_width
	else:
	quantized = round_ste(self.bound(z)) / half_width

	if not self.training:
	return quantized

	batch, device, noise_dropout = z.shape[0], z.device, self.noise_dropout
	unquantized = z

	# determine where to quantize elementwise

	quantize_mask = torch.bernoulli(
	torch.full((batch,), noise_dropout, device = device)
	).bool()

	quantized = einx.where('b, b ..., b ...', quantize_mask, unquantized, quantized)

	# determine where to add a random offset elementwise

	offset_mask = torch.bernoulli(
	torch.full((batch,), noise_dropout, device = device)
	).bool()

	offset = (torch.rand_like(z) - 0.5) / half_width
	quantized = einx.where('b, b ..., b ...', offset_mask, unquantized + offset, quantized)

	return quantized

	def _scale_and_shift(self, zhat_normalized):
	half_width = self._levels // 2
	return (zhat_normalized * half_width) + half_width

	def _scale_and_shift_inverse(self, zhat):
	half_width = self._levels // 2
	return (zhat - half_width) / half_width

	def _indices_to_codes(self, indices):
	level_indices = self.indices_to_level_indices(indices)
	codes = self._scale_and_shift_inverse(level_indices)
	return codes

	def codes_to_indices(self, zhat):
	""" Converts a `code` to an index in the codebook. """
	assert zhat.shape[-1] == self.codebook_dim
	zhat = self._scale_and_shift(zhat)
	return (zhat * self._basis).sum(dim=-1).to(int32)

	def indices_to_level_indices(self, indices):
	""" Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
	indices = rearrange(indices, '... -> ... 1')
	codes_non_centered = (indices // self._basis) % self._levels
	return codes_non_centered

	def indices_to_codes(self, indices):
	""" Inverse of `codes_to_indices`. """
	assert exists(indices)

	is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))

	codes = self._indices_to_codes(indices)

	if self.keep_num_codebooks_dim:
	codes = rearrange(codes, '... c d -> ... (c d)')

	codes = self.project_out(codes)

	if is_img_or_video or self.channel_first:
	codes = rearrange(codes, 'b ... d -> b d ...')

	return codes

	def forward(self, z):
	"""
	einstein notation
	b - batch
	n - sequence (or flattened spatial dimensions)
	d - feature dimension
	c - number of codebook dim
	"""

	is_img_or_video = z.ndim >= 4
	need_move_channel_last = is_img_or_video or self.channel_first

	# standardize image or video into (batch, seq, dimension)

	if need_move_channel_last:
	z = rearrange(z, 'b d ... -> b ... d')
	z, ps = pack_one(z, 'b * d')

	assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'

	z = self.project_in(z)

	z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)

	# whether to force quantization step to be full precision or not

	force_f32 = self.force_quantization_f32
	quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext

	with quantization_context():
	orig_dtype = z.dtype

	if force_f32 and orig_dtype not in self.allowed_dtypes:
	z = z.float()

	codes = self.quantize(z)

	# returning indices could be optional

	indices = None

	if self.return_indices:
	indices = self.codes_to_indices(codes)

	codes = rearrange(codes, 'b n c d -> b n (c d)')

	codes = codes.to(orig_dtype)

	# project out

	out = self.project_out(codes)

	# reconstitute image or video dimensions

	if need_move_channel_last:
	out = unpack_one(out, ps, 'b * d')
	out = rearrange(out, 'b ... d -> b d ...')

	indices = maybe(unpack_one)(indices, ps, 'b * c')

	if not self.keep_num_codebooks_dim and self.return_indices:
	indices = maybe(rearrange)(indices, '... 1 -> ...')

	# return quantized output and indices

	return out, indices