Spaces:

multimodalart
/

StreamDiffusionV2-Realtime

Running on Zero

App Files Files Community

StreamDiffusionV2-Realtime / models /wan /wan_wrapper.py

multimodalart HF Staff

Upload folder using huggingface_hub

5c93746 verified 2 days ago

Raw

History Blame Contribute Delete

13.2 kB

	from models.model_interface import (
	DiffusionModelInterface,
	TextEncoderInterface,
	VAEInterface
	)
	from models.wan.wan_base.modules.tokenizers import HuggingfaceTokenizer
	from models.wan.wan_base.modules.model import WanModel
	from models.wan.wan_base.modules.vae import _video_vae
	from models.wan.wan_base.modules.t5 import umt5_xxl
	from models.wan.flow_match import FlowMatchScheduler
	from models.wan.causal_model import CausalWanModel
	from typing import List, Tuple, Dict, Optional
	import torch
	import os
	import torch.distributed as dist
	import time
	from pathlib import Path


	def _resolve_project_root() -> Path:
	    """Return the directory that this module's model paths are resolved against.

	    Lookup order:
	      1. ``STREAMDIFFUSIONV2_ROOT`` from the environment, when it is set to a
	         non-empty value (``~`` is expanded and the path is made absolute).
	      2. The directory two levels above this file's own directory, if it
	         contains a ``wan_models`` entry.
	      3. The current working directory, if it contains a ``wan_models`` entry.
	      4. Otherwise the directory from step 2, even though no ``wan_models``
	         entry was found there.
	    """
	    override = os.environ.get("STREAMDIFFUSIONV2_ROOT")
	    if override:
	        return Path(override).expanduser().resolve()

	    module_root = Path(__file__).resolve().parents[2]
	    if not (module_root / "wan_models").exists():
	        # Only consult the working directory when the file-relative guess
	        # does not hold the weights folder.
	        working_dir = Path.cwd().resolve()
	        if (working_dir / "wan_models").exists():
	            return working_dir

	    return module_root


	# Computed once when this module is imported; the wan_models checkpoint and
	# tokenizer paths used by the classes in this file are built from it.
	PROJECT_ROOT = _resolve_project_root()


	class WanTextEncoder(TextEncoderInterface):
	    """UMT5-XXL encoder and tokenizer loaded from a local Wan2.1 checkpoint folder.

	    All files are read from ``PROJECT_ROOT / "wan_models/Wan2.1-<model_type>"``.
	    """

	    def __init__(self, model_type="T2V-1.3B") -> None:
	        """Build the encoder on CPU, load its weights, and create the tokenizer.

	        Args:
	            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
	        """
	        super().__init__()

	        # Encoder-only network, created in float32 on CPU and frozen for inference.
	        encoder = umt5_xxl(
	            encoder_only=True,
	            return_tokenizer=False,
	            dtype=torch.float32,
	            device=torch.device('cpu')
	        )
	        self.text_encoder = encoder.eval().requires_grad_(False)

	        # NOTE(review): weights_only=False allows torch.load to unpickle
	        # arbitrary objects, so this must only be pointed at trusted files.
	        checkpoint_path = PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/models_t5_umt5-xxl-enc-bf16.pth"
	        state_dict = torch.load(
	            checkpoint_path, map_location='cpu', weights_only=False)
	        self.text_encoder.load_state_dict(state_dict)

	        tokenizer_dir = str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/google/umt5-xxl/")
	        self.tokenizer = HuggingfaceTokenizer(
	            name=tokenizer_dir, seq_len=512, clean='whitespace')

	    @property
	    def device(self):
	        """Device of the first parameter yielded by ``self.parameters()``."""
	        first_param = next(self.parameters())
	        return first_param.device

	    def forward(self, text_prompts: List[str]) -> dict:
	        """Tokenize and encode the prompts.

	        Args:
	            text_prompts: Prompts to encode.

	        Returns:
	            A dict with the single key ``"prompt_embeds"`` holding the encoder
	            output, where every position at or beyond a prompt's count of
	            non-zero mask entries has been overwritten with 0.0.
	        """
	        token_ids, attention_mask = self.tokenizer(
	            text_prompts, return_mask=True, add_special_tokens=True)
	        token_ids = token_ids.to(self.device)
	        attention_mask = attention_mask.to(self.device)

	        # Number of mask entries greater than zero for each prompt.
	        valid_lengths = attention_mask.gt(0).sum(dim=1).long()
	        context = self.text_encoder(token_ids, attention_mask)

	        # In-place: blank out everything past each prompt's valid length.
	        for embedding, length in zip(context, valid_lengths):
	            embedding[length:] = 0.0

	        return {"prompt_embeds": context}


	class WanVAEWrapper(VAEInterface):
	    """Wrapper around the Wan2.1 video VAE with fixed latent mean/std statistics.

	    Every encode/decode call hands the underlying model a two-element
	    ``[mean, 1 / std]`` list built from the statistics stored here.
	    """

	    def __init__(self, model_type="T2V-1.3B"):
	        """Store the latent statistics and load the VAE weights.

	        Args:
	            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
	        """
	        super().__init__()

	        # 16 values each; presumably one per latent channel (z_dim=16 below)
	        # -- TODO confirm.
	        latent_mean = [
	            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
	            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
	        ]
	        latent_std = [
	            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
	            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
	        ]
	        self.mean = torch.tensor(latent_mean, dtype=torch.float32)
	        self.std = torch.tensor(latent_std, dtype=torch.float32)

	        # Load the VAE and freeze it for inference.
	        vae = _video_vae(
	            pretrained_path=str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/Wan2.1_VAE.pth"),
	            z_dim=16,
	        )
	        self.model = vae.eval().requires_grad_(False)

	    def _build_scale(self, device, dtype):
	        """Return ``[mean, 1 / std]`` moved to the requested device and dtype."""
	        shift = self.mean.to(device=device, dtype=dtype)
	        inv_std = 1.0 / self.std.to(device=device, dtype=dtype)
	        return [shift, inv_std]

	    def decode_to_pixel(self, latent: torch.Tensor) -> torch.Tensor:
	        """Decode a latent batch one sample at a time.

	        Axes 1 and 2 of ``latent`` are swapped before decoding and swapped
	        back on the result; the original comments describe the input as
	        [batch_size, num_frames, num_channels, height, width].

	        Returns:
	            The stacked per-sample decodes, cast to float32 and clamped to
	            [-1, 1].
	        """
	        scale = self._build_scale(latent.device, latent.dtype)
	        channel_first = latent.permute(0, 2, 1, 3, 4)

	        decoded = []
	        for sample in channel_first:
	            # The model is called with a batch of one for each sample.
	            pixels = self.model.decode(sample.unsqueeze(0), scale)
	            decoded.append(pixels.float().clamp_(-1, 1).squeeze(0))

	        stacked = torch.stack(decoded, dim=0)
	        return stacked.permute(0, 2, 1, 3, 4)

	    def decode(self, latent: torch.Tensor) -> torch.Tensor:
	        """Decode the whole latent batch in a single model call.

	        Axes 1 and 2 of ``latent`` are swapped before decoding. Unlike
	        ``decode_to_pixel``, the result is neither cast to float32 nor
	        permuted back: it is returned in the axis order produced by the
	        model, clamped to [-1, 1].
	        """
	        scale = self._build_scale(latent.device, latent.dtype)
	        channel_first = latent.permute(0, 2, 1, 3, 4)
	        # NOTE(review): the permute back to the input axis order is disabled
	        # in the original code; callers presumably rely on that -- confirm.
	        return self.model.decode(channel_first, scale).clamp_(-1, 1)

	    def stream_encode(self, video: torch.Tensor, is_scale=False) -> torch.Tensor:
	        """Forward ``video`` to the model's ``stream_encode``.

	        Args:
	            video: Tensor passed through unchanged.
	            is_scale: When true, pass ``[mean, 1 / std]`` (on the video's
	                device and dtype) as the scale; otherwise pass ``None``.
	        """
	        scale = self._build_scale(video.device, video.dtype) if is_scale else None
	        return self.model.stream_encode(video, scale)

	    def stream_decode_to_pixel(self, latent: torch.Tensor) -> torch.Tensor:
	        """Decode ``latent`` through the model's ``stream_decode``.

	        Axes 1 and 2 are swapped and the latent is cast to bfloat16 before
	        decoding; the result is cast to float32, clamped to [-1, 1], and has
	        axes 1 and 2 swapped back.
	        """
	        # NOTE(review): the scale keeps the dtype of the incoming latent while
	        # the latent itself is cast to bfloat16 -- confirm this is intended.
	        scale = self._build_scale(latent.device, latent.dtype)
	        bf16_latent = latent.permute(0, 2, 1, 3, 4).to(
	            device=latent.device, dtype=torch.bfloat16)
	        pixels = self.model.stream_decode(bf16_latent, scale).float().clamp_(-1, 1)
	        return pixels.permute(0, 2, 1, 3, 4)


	class WanDiffusionWrapper(DiffusionModelInterface):
	    """Wraps a pretrained ``WanModel`` so that callers receive x0 predictions.

	    The wrapped model's output is treated as a flow-matching prediction and
	    converted to an x0 estimate with the sigma table of ``self.scheduler``.
	    """

	    def __init__(self, model_type="T2V-1.3B"):
	        """Load the model and set up the flow-matching scheduler.

	        Args:
	            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
	        """
	        super().__init__()

	        weights_dir = str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/")
	        self.model = WanModel.from_pretrained(weights_dir)
	        self.model.eval()

	        # When true, only column 0 of the timestep tensor is given to the model.
	        self.uniform_timestep = True

	        self.scheduler = FlowMatchScheduler(
	            shift=8.0,
	            sigma_min=0.0,
	            extra_one_step=True,
	        )
	        self.scheduler.set_timesteps(1000, training=True)

	        self.seq_len = 32760  # [1, 21, 16, 60, 104]
	        super().post_init()

	    def enable_gradient_checkpointing(self) -> None:
	        """Delegate to the wrapped model's ``enable_gradient_checkpointing``."""
	        self.model.enable_gradient_checkpointing()

	    def _select_model_timestep(self, timestep: torch.Tensor) -> torch.Tensor:
	        """Return the timestep tensor in the form handed to the model.

	        With ``uniform_timestep`` set, this is ``timestep[:, 0]``
	        ([B, F] -> [B]); otherwise ``timestep`` is returned unchanged.
	        """
	        if self.uniform_timestep:
	            return timestep[:, 0]
	        return timestep

	    def _convert_flow_pred_to_x0(self, flow_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
	        """
	        Turn a flow-matching prediction into an x0 prediction.
	        flow_pred: the prediction, 4-D ([B, C, H, W])
	        xt: the noisy input, same shape as flow_pred
	        timestep: one timestep per batch entry ([B])

	        With pred = noise - x0 and x_t = (1 - sigma_t) * x0 + sigma_t * noise,
	        it follows that x0 = x_t - sigma_t * pred.
	        Derivation: https://chatgpt.com/share/67bf8589-3d04-8008-bc6e-4cf1a24e2d0e
	        """
	        # Do the arithmetic in float64 and cast back at the end.
	        out_dtype = flow_pred.dtype
	        target_device = flow_pred.device
	        flow_pred64 = flow_pred.double().to(target_device)
	        xt64 = xt.double().to(target_device)
	        sigmas64 = self.scheduler.sigmas.double().to(target_device)
	        timesteps64 = self.scheduler.timesteps.double().to(target_device)

	        # For every requested timestep, pick the closest scheduler timestep.
	        distance = (timesteps64.unsqueeze(0) - timestep.unsqueeze(1)).abs()
	        nearest = torch.argmin(distance, dim=1)
	        sigma_t = sigmas64[nearest].reshape(-1, 1, 1, 1)

	        return (xt64 - sigma_t * flow_pred64).to(out_dtype)

	    @staticmethod
	    def _convert_x0_to_flow_pred(scheduler, x0_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
	        """
	        Turn an x0 prediction into a flow-matching prediction.
	        scheduler: object providing the ``sigmas`` and ``timesteps`` tensors
	        x0_pred: the x0 prediction, 4-D ([B, C, H, W])
	        xt: the noisy input, same shape as x0_pred
	        timestep: one timestep per batch entry ([B])

	        pred = (x_t - x_0) / sigma_t
	        """
	        # Do the arithmetic in float64 and cast back at the end.
	        out_dtype = x0_pred.dtype
	        target_device = x0_pred.device
	        x0_pred64 = x0_pred.double().to(target_device)
	        xt64 = xt.double().to(target_device)
	        sigmas64 = scheduler.sigmas.double().to(target_device)
	        timesteps64 = scheduler.timesteps.double().to(target_device)

	        # For every requested timestep, pick the closest scheduler timestep.
	        distance = (timesteps64.unsqueeze(0) - timestep.unsqueeze(1)).abs()
	        nearest = torch.argmin(distance, dim=1)
	        sigma_t = sigmas64[nearest].reshape(-1, 1, 1, 1)

	        return ((xt64 - x0_pred64) / sigma_t).to(out_dtype)

	    def forward(
	        self,
	        noisy_image_or_video: torch.Tensor,
	        conditional_dict: dict,
	        timestep: torch.Tensor,
	        kv_cache: Optional[List[dict]] = None,
	        crossattn_cache: Optional[List[dict]] = None,
	        current_start: Optional[int] = None,
	        current_end: Optional[int] = None
	    ) -> torch.Tensor:
	        """Run the model and return its x0 prediction.

	        Args:
	            noisy_image_or_video: 5-D noisy input; axes 1 and 2 are swapped
	                before the model call and swapped back on its output.
	            conditional_dict: Must contain ``"prompt_embeds"``.
	            timestep: Timestep tensor ([B, F]); also used, flattened, for the
	                x0 conversion.
	            kv_cache: When not ``None``, it is forwarded to the model together
	                with ``crossattn_cache``, ``current_start`` and
	                ``current_end``; otherwise none of the four is passed.

	        Returns:
	            The x0 prediction, with the same leading two axes as the
	            (swapped-back) model output.
	        """
	        prompt_embeds = conditional_dict["prompt_embeds"]
	        input_timestep = self._select_model_timestep(timestep)

	        model_kwargs = {
	            "t": input_timestep,
	            "context": prompt_embeds,
	            "seq_len": self.seq_len,
	        }
	        if kv_cache is not None:
	            # Cache-related arguments are only supplied on the cached path.
	            model_kwargs.update(
	                kv_cache=kv_cache,
	                crossattn_cache=crossattn_cache,
	                current_start=current_start,
	                current_end=current_end,
	            )

	        model_input = noisy_image_or_video.permute(0, 2, 1, 3, 4)
	        flow_pred = self.model(model_input, **model_kwargs).permute(0, 2, 1, 3, 4)

	        # Merge the first two axes for the conversion, then restore them.
	        leading_shape = flow_pred.shape[:2]
	        flat_x0 = self._convert_flow_pred_to_x0(
	            flow_pred=flow_pred.flatten(0, 1),
	            xt=noisy_image_or_video.flatten(0, 1),
	            timestep=timestep.flatten(0, 1)
	        )
	        return flat_x0.unflatten(0, leading_shape)

	    def forward_input(
	        self,
	        noisy_image_or_video: torch.Tensor,
	        conditional_dict: dict,
	        timestep: torch.Tensor,
	        block_mode: str = 'input',
	        block_num=None,
	        kv_cache: Optional[List[dict]] = None,
	        crossattn_cache: Optional[List[dict]] = None,
	        current_start: Optional[int] = None,
	        current_end: Optional[int] = None,
	        patched_x_shape: torch.Tensor = None,
	        block_x: torch.Tensor = None,
	    ) -> torch.Tensor:
	        """Run the model in a block mode and return its raw two-part result.

	        The model input is ``block_x`` when it is given and ``block_mode`` is
	        ``'middle'``; otherwise it is ``noisy_image_or_video`` with axes 1
	        and 2 swapped. No x0 conversion is applied here.

	        Returns:
	            The pair ``(output, patched_x_shape)`` produced by the model
	            (a tuple, despite the ``torch.Tensor`` annotation).

	        Raises:
	            AssertionError: If ``kv_cache`` is ``None``.
	        """
	        assert kv_cache is not None, "kv_cache must be provided"

	        prompt_embeds = conditional_dict["prompt_embeds"]
	        input_timestep = self._select_model_timestep(timestep)

	        use_block_input = block_x is not None and block_mode == 'middle'
	        model_input = (
	            block_x if use_block_input
	            else noisy_image_or_video.permute(0, 2, 1, 3, 4)
	        )

	        output, patched_x_shape = self.model(
	            model_input,
	            t=input_timestep,
	            context=prompt_embeds,
	            seq_len=self.seq_len,
	            kv_cache=kv_cache,
	            crossattn_cache=crossattn_cache,
	            current_start=current_start,
	            current_end=current_end,
	            block_mode=block_mode,
	            block_num=block_num,
	            patched_x_shape=patched_x_shape,
	        )
	        return output, patched_x_shape

	    def forward_output(
	        self,
	        noisy_image_or_video: torch.Tensor,
	        conditional_dict: dict,
	        timestep: torch.Tensor,
	        block_mode: str = 'output',
	        block_num=None,
	        kv_cache: Optional[List[dict]] = None,
	        crossattn_cache: Optional[List[dict]] = None,
	        current_start: Optional[int] = None,
	        current_end: Optional[int] = None,
	        patched_x_shape: torch.Tensor = None,
	        block_x: torch.Tensor = None,
	    ) -> torch.Tensor:
	        """Run the model on ``block_x`` in a block mode and return the x0 prediction.

	        ``block_x`` is always the model input; ``noisy_image_or_video`` is
	        only used as x_t in the flow-to-x0 conversion.

	        Raises:
	            AssertionError: If ``kv_cache`` is ``None``.
	        """
	        assert kv_cache is not None, "kv_cache must be provided"

	        prompt_embeds = conditional_dict["prompt_embeds"]
	        input_timestep = self._select_model_timestep(timestep)

	        raw_pred = self.model(
	            block_x,
	            t=input_timestep,
	            context=prompt_embeds,
	            seq_len=self.seq_len,
	            kv_cache=kv_cache,
	            crossattn_cache=crossattn_cache,
	            current_start=current_start,
	            current_end=current_end,
	            block_mode=block_mode,
	            block_num=block_num,
	            patched_x_shape=patched_x_shape,
	        )
	        flow_pred = raw_pred.permute(0, 2, 1, 3, 4)

	        # Merge the first two axes for the conversion, then restore them.
	        leading_shape = flow_pred.shape[:2]
	        flat_x0 = self._convert_flow_pred_to_x0(
	            flow_pred=flow_pred.flatten(0, 1),
	            xt=noisy_image_or_video.flatten(0, 1),
	            timestep=timestep.flatten(0, 1)
	        )
	        return flat_x0.unflatten(0, leading_shape)


	class CausalWanDiffusionWrapper(WanDiffusionWrapper):
	    """``WanDiffusionWrapper`` whose model is replaced by a ``CausalWanModel``.

	    ``uniform_timestep`` is switched off, so the parent's forward methods hand
	    the full timestep tensor to the model instead of its first column.
	    """

	    def __init__(self, model_type="T2V-1.3B"):
	        """Run the parent setup, then swap in the causal model.

	        Args:
	            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory the
	                causal model is loaded from.
	        """
	        # NOTE(review): the parent constructor is called without model_type,
	        # so it first loads a WanModel from its default "T2V-1.3B" folder
	        # whatever value was passed here; that model is then replaced below.
	        # Confirm whether this is intentional before changing it.
	        super().__init__()

	        checkpoint_dir = str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/")
	        self.model = CausalWanModel.from_pretrained(checkpoint_dir)
	        self.model.eval()

	        self.uniform_timestep = False