Spaces:

multimodalart
/

StreamDiffusionV2-Realtime

Running on Zero

App Files Files Community

StreamDiffusionV2-Realtime / models /scheduler.py

multimodalart HF Staff

Upload folder using huggingface_hub

5c93746 verified 3 days ago

Raw

History Blame Contribute Delete

3.93 kB

	from abc import abstractmethod, ABC
	import torch


	def _sqrt_alpha_beta(alphas_cumprod, timestep, like):
	    """
	    Gather sqrt(alpha_t) and sqrt(1 - alpha_t) for a batch of timesteps.

	    Kept as a module-level function (not a method) so that the
	    convert_* methods below read nothing from `self` other than
	    `alphas_cumprod`.

	    Input:
	        - alphas_cumprod: the cumulative alpha table with shape [T]
	        - timestep: indices into the table, one per sample, shape [B]
	        - like: a tensor with shape [B, *]; its device is where the
	          coefficients are placed and its rank decides how many
	          trailing singleton dimensions they get
	    Output: (sqrt_alpha_t, sqrt_beta_t), both float64 with shape
	        [B, 1, ..., 1] so they broadcast against tensors shaped like `like`
	    """
	    # float64 first, then move: same order as the per-tensor conversion
	    # used for the data tensors.
	    table = alphas_cumprod.double().to(like.device)
	    # One singleton per non-batch dimension, so the coefficients line up
	    # with the leading (batch) axis for any rank, not only [B, C, H, W].
	    trailing = (1,) * (like.dim() - 1)
	    alpha_prod_t = table[timestep].reshape(-1, *trailing)
	    beta_prod_t = 1 - alpha_prod_t
	    return alpha_prod_t ** 0.5, beta_prod_t ** 0.5


	class SchedulerInterface(ABC):
	    """
	    Base class for diffusion noise schedule.

	    Subclasses provide `alphas_cumprod` and implement `add_noise`; the
	    convert_* methods translate between the x0, noise and velocity
	    parameterizations using
	        x_t = sqrt(alpha_t) * x0 + sqrt(beta_t) * noise,  beta_t = 1 - alpha_t
	    """
	    alphas_cumprod: torch.Tensor  # [T], alphas for defining the noise schedule

	    @abstractmethod
	    def add_noise(
	        self, clean_latent: torch.Tensor,
	        noise: torch.Tensor, timestep: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Diffusion forward corruption process.
	        Input:
	            - clean_latent: the clean latent with shape [B, C, H, W]
	            - noise: the noise with shape [B, C, H, W]
	            - timestep: the timestep with shape [B]
	        Output: the corrupted latent with shape [B, C, H, W]
	        """

	    def convert_x0_to_noise(
	        self, x0: torch.Tensor, xt: torch.Tensor,
	        timestep: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Convert the diffusion network's x0 prediction to noise prediction.
	        x0: the predicted clean data with shape [B, *] (e.g. [B, C, H, W])
	        xt: the input noisy data with the same shape as x0
	        timestep: the timestep with shape [B]

	        noise = (xt-sqrt(alpha_t)*x0) / sqrt(beta_t) (eq 11 in https://arxiv.org/abs/2311.18828)

	        Returns the noise prediction in the dtype of `x0`.
	        """
	        original_dtype = x0.dtype
	        device = x0.device
	        sqrt_alpha, sqrt_beta = _sqrt_alpha_beta(
	            self.alphas_cumprod, timestep, x0)

	        # use higher precision for calculations
	        x0 = x0.double()
	        xt = xt.double().to(device)

	        noise_pred = (xt - sqrt_alpha * x0) / sqrt_beta
	        return noise_pred.to(original_dtype)

	    def convert_noise_to_x0(
	        self, noise: torch.Tensor, xt: torch.Tensor,
	        timestep: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Convert the diffusion network's noise prediction to x0 prediction.
	        noise: the predicted noise with shape [B, *] (e.g. [B, C, H, W])
	        xt: the input noisy data with the same shape as noise
	        timestep: the timestep with shape [B]

	        x0 = (x_t - sqrt(beta_t) * noise) / sqrt(alpha_t) (eq 11 in https://arxiv.org/abs/2311.18828)

	        Returns the x0 prediction in the dtype of `noise`.
	        """
	        original_dtype = noise.dtype
	        device = noise.device
	        sqrt_alpha, sqrt_beta = _sqrt_alpha_beta(
	            self.alphas_cumprod, timestep, noise)

	        # use higher precision for calculations
	        noise = noise.double()
	        xt = xt.double().to(device)

	        x0_pred = (xt - sqrt_beta * noise) / sqrt_alpha
	        return x0_pred.to(original_dtype)

	    def convert_velocity_to_x0(
	        self, velocity: torch.Tensor, xt: torch.Tensor,
	        timestep: torch.Tensor
	    ) -> torch.Tensor:
	        """
	        Convert the diffusion network's velocity prediction to x0 prediction.
	        velocity: the predicted velocity with shape [B, *] (e.g. [B, C, H, W])
	        xt: the input noisy data with the same shape as velocity
	        timestep: the timestep with shape [B]

	        v = sqrt(alpha_t) * noise - sqrt(beta_t) * x0
	        x_t = sqrt(alpha_t) * x0 + sqrt(beta_t) * noise
	        Substituting both into sqrt(alpha_t) * x_t - sqrt(beta_t) * v, the
	        noise terms cancel and (alpha_t + beta_t) * x0 = x0 remains, so
	        x0 = sqrt(alpha_t) * x_t - sqrt(beta_t) * v
	        see derivations https://chatgpt.com/share/679fb6c8-3a30-8008-9b0e-d1ae892dac56

	        Returns the x0 prediction in the dtype of `velocity`.
	        """
	        original_dtype = velocity.dtype
	        device = velocity.device
	        sqrt_alpha, sqrt_beta = _sqrt_alpha_beta(
	            self.alphas_cumprod, timestep, velocity)

	        # use higher precision for calculations
	        velocity = velocity.double()
	        xt = xt.double().to(device)

	        x0_pred = sqrt_alpha * xt - sqrt_beta * velocity
	        return x0_pred.to(original_dtype)