Spaces:
Running on Zero
Running on Zero
| from models.model_interface import ( | |
| DiffusionModelInterface, | |
| TextEncoderInterface, | |
| VAEInterface | |
| ) | |
| from models.wan.wan_base.modules.tokenizers import HuggingfaceTokenizer | |
| from models.wan.wan_base.modules.model import WanModel | |
| from models.wan.wan_base.modules.vae import _video_vae | |
| from models.wan.wan_base.modules.t5 import umt5_xxl | |
| from models.wan.flow_match import FlowMatchScheduler | |
| from models.wan.causal_model import CausalWanModel | |
| from typing import List, Tuple, Dict, Optional | |
| import torch | |
| import os | |
| import torch.distributed as dist | |
| import time | |
| from pathlib import Path | |
def _resolve_project_root() -> Path:
    """Work out which directory holds the ``wan_models`` checkpoint folder.

    Resolution order:
      1. The ``STREAMDIFFUSIONV2_ROOT`` environment variable, when set to a
         non-empty value (user-expanded and resolved; existence is not checked).
      2. The third ancestor of this file (``parents[2]``), when it contains a
         ``wan_models`` entry.
      3. The current working directory, when it contains a ``wan_models`` entry.
      4. Otherwise the third ancestor of this file, even though ``wan_models``
         was not found there.
    """
    override = os.environ.get("STREAMDIFFUSIONV2_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    file_based_root = Path(__file__).resolve().parents[2]
    # Only consult the working directory when the file-based guess fails.
    if not (file_based_root / "wan_models").exists():
        working_dir = Path.cwd().resolve()
        if (working_dir / "wan_models").exists():
            return working_dir
    return file_based_root


# Resolved once at import time; all checkpoint paths below are built from it.
PROJECT_ROOT = _resolve_project_root()
class WanTextEncoder(TextEncoderInterface):
    """Prompt encoder built from the ``umt5_xxl`` encoder and its tokenizer.

    Weights and tokenizer files are read from
    ``PROJECT_ROOT / "wan_models/Wan2.1-<model_type>"``.
    """

    def __init__(self, model_type="T2V-1.3B") -> None:
        """Build the encoder on CPU in float32 and load its checkpoint.

        Args:
            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
        """
        super().__init__()

        # Inference only: eval mode, gradients disabled.
        self.text_encoder = umt5_xxl(
            encoder_only=True,
            return_tokenizer=False,
            dtype=torch.float32,
            device=torch.device('cpu')
        ).eval().requires_grad_(False)

        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoints from a trusted source, or confirm that this file
        # loads with weights_only=True and switch.
        self.text_encoder.load_state_dict(
            torch.load(
                PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/models_t5_umt5-xxl-enc-bf16.pth",
                map_location='cpu', weights_only=False
            )
        )

        self.tokenizer = HuggingfaceTokenizer(
            name=str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/google/umt5-xxl/"),
            seq_len=512, clean='whitespace')

    @property
    def device(self):
        """Device of the first parameter of this module.

        Exposed as a property because ``forward`` reads it as an attribute
        (``ids.to(self.device)``); as a plain method that call would receive
        a bound method instead of a device.
        """
        return next(self.parameters()).device

    def forward(self, text_prompts: List[str]) -> dict:
        """Encode prompts into text embeddings.

        Args:
            text_prompts: Prompts to tokenize and encode.

        Returns:
            ``{"prompt_embeds": context}`` where every position at or beyond
            a prompt's unmasked token count has been set to zero.
        """
        ids, mask = self.tokenizer(
            text_prompts, return_mask=True, add_special_tokens=True)
        ids = ids.to(self.device)
        mask = mask.to(self.device)

        # Number of real (unmasked) tokens per prompt.
        seq_lens = mask.gt(0).sum(dim=1).long()
        context = self.text_encoder(ids, mask)

        # Zero the padded tail of each sequence in place.
        for embedding, length in zip(context, seq_lens):
            embedding[length:] = 0.0

        return {
            "prompt_embeds": context
        }
class WanVAEWrapper(VAEInterface):
    """Wrapper around the Wan2.1 video VAE with fixed per-channel latent statistics."""

    def __init__(self, model_type="T2V-1.3B"):
        """Store the 16 latent mean/std values and load the VAE checkpoint.

        Args:
            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
        """
        super().__init__()

        channel_mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        channel_std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        self.mean = torch.tensor(channel_mean, dtype=torch.float32)
        self.std = torch.tensor(channel_std, dtype=torch.float32)

        # Inference only: eval mode, gradients disabled.
        vae = _video_vae(
            pretrained_path=str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/Wan2.1_VAE.pth"),
            z_dim=16,
        )
        self.model = vae.eval().requires_grad_(False)

    def _latent_scale(self, device, dtype):
        """Return ``[mean, 1 / std]`` moved to ``device`` and cast to ``dtype``."""
        mean = self.mean.to(device=device, dtype=dtype)
        inv_std = 1.0 / self.std.to(device=device, dtype=dtype)
        return [mean, inv_std]

    def decode_to_pixel(self, latent: torch.Tensor) -> torch.Tensor:
        """Decode latents one batch element at a time.

        Input and output are both laid out as
        [batch_size, num_frames, num_channels, height, width]; the output is
        float32 clamped to [-1, 1].
        """
        scale = self._latent_scale(latent.device, latent.dtype)

        decoded = []
        # [B, F, C, H, W] -> [B, C, F, H, W], then iterate over the batch.
        for sample in latent.permute(0, 2, 1, 3, 4):
            pixels = self.model.decode(sample.unsqueeze(0), scale)
            decoded.append(pixels.float().clamp_(-1, 1).squeeze(0))

        # [B, C, F, H, W] -> [B, F, C, H, W]
        return torch.stack(decoded, dim=0).permute(0, 2, 1, 3, 4)

    def decode(self, latent: torch.Tensor) -> torch.Tensor:
        """Decode the whole batch in a single call, clamped to [-1, 1].

        The input is [batch_size, num_frames, num_channels, height, width].
        Unlike ``decode_to_pixel``, the result is NOT permuted back: it is
        returned in the layout the VAE produces for a
        [batch_size, num_channels, num_frames, height, width] input.
        """
        scale = self._latent_scale(latent.device, latent.dtype)
        channels_first = latent.permute(0, 2, 1, 3, 4)
        return self.model.decode(channels_first, scale).clamp_(-1, 1)

    def stream_encode(self, video: torch.Tensor, is_scale=False) -> torch.Tensor:
        """Forward ``video`` to the VAE's ``stream_encode``.

        Args:
            video: Input passed through unchanged.
            is_scale: When true, pass ``[mean, 1 / std]`` (on the video's
                device and dtype) as the scale; otherwise pass ``None``.
        """
        scale = self._latent_scale(video.device, video.dtype) if is_scale else None
        return self.model.stream_encode(video, scale)

    def stream_decode_to_pixel(self, latent: torch.Tensor) -> torch.Tensor:
        """Decode latents through the VAE's ``stream_decode``.

        The latent is swapped on dims 1 and 2 and cast to bfloat16 before
        decoding, while the scale keeps the latent's original dtype. The
        output is float32, clamped to [-1, 1], with dims 1 and 2 swapped back.
        """
        channels_first = latent.permute(0, 2, 1, 3, 4).to(
            device=latent.device, dtype=torch.bfloat16)
        scale = self._latent_scale(latent.device, latent.dtype)
        pixels = self.model.stream_decode(channels_first, scale).float().clamp_(-1, 1)
        return pixels.permute(0, 2, 1, 3, 4)
class WanDiffusionWrapper(DiffusionModelInterface):
    """Wraps a Wan2.1 flow-matching model and returns x0 predictions."""

    def __init__(self, model_type="T2V-1.3B"):
        """Load the pretrained model and set up the flow-matching scheduler.

        Args:
            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
        """
        super().__init__()

        self.model = WanModel.from_pretrained(str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/"))
        self.model.eval()

        # When true, only the first column of the [B, F] timestep tensor is
        # passed to the model (see _select_input_timestep).
        self.uniform_timestep = True

        self.scheduler = FlowMatchScheduler(
            shift=8.0, sigma_min=0.0, extra_one_step=True
        )
        self.scheduler.set_timesteps(1000, training=True)

        self.seq_len = 32760  # [1, 21, 16, 60, 104]
        super().post_init()

    def enable_gradient_checkpointing(self) -> None:
        """Turn on gradient checkpointing in the wrapped model."""
        self.model.enable_gradient_checkpointing()

    def _select_input_timestep(self, timestep: torch.Tensor) -> torch.Tensor:
        """Return the timestep tensor handed to the model.

        [B, F] -> [B] when ``uniform_timestep`` is set, otherwise unchanged.
        """
        if self.uniform_timestep:
            return timestep[:, 0]
        return timestep

    def _convert_flow_pred_to_x0(self, flow_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
        """
        Convert flow matching's prediction to x0 prediction.
        flow_pred: the prediction with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]
        pred = noise - x0
        x_t = (1-sigma_t) * x0 + sigma_t * noise
        we have x0 = x_t - sigma_t * pred
        see derivations https://chatgpt.com/share/67bf8589-3d04-8008-bc6e-4cf1a24e2d0e
        """
        # use higher precision for calculations
        original_dtype = flow_pred.dtype
        device = flow_pred.device
        flow_pred = flow_pred.double().to(device)
        xt = xt.double().to(device)
        sigmas = self.scheduler.sigmas.double().to(device)
        timesteps = self.scheduler.timesteps.double().to(device)

        # Pick, per batch element, the scheduler step closest to the timestep.
        timestep_id = torch.argmin(
            (timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1)
        sigma_t = sigmas[timestep_id].reshape(-1, 1, 1, 1)
        x0_pred = xt - sigma_t * flow_pred
        return x0_pred.to(original_dtype)

    @staticmethod
    def _convert_x0_to_flow_pred(scheduler, x0_pred: torch.Tensor, xt: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
        """
        Convert x0 prediction to flow matching's prediction.
        scheduler: object providing ``sigmas`` and ``timesteps`` tensors
        x0_pred: the x0 prediction with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B]
        pred = (x_t - x_0) / sigma_t

        Declared static because the first parameter is the scheduler, not an
        instance; without the decorator an instance call would bind the
        wrapper itself to ``scheduler``.
        """
        # use higher precision for calculations
        original_dtype = x0_pred.dtype
        device = x0_pred.device
        x0_pred = x0_pred.double().to(device)
        xt = xt.double().to(device)
        sigmas = scheduler.sigmas.double().to(device)
        timesteps = scheduler.timesteps.double().to(device)

        # Pick, per batch element, the scheduler step closest to the timestep.
        timestep_id = torch.argmin(
            (timesteps.unsqueeze(0) - timestep.unsqueeze(1)).abs(), dim=1)
        sigma_t = sigmas[timestep_id].reshape(-1, 1, 1, 1)
        flow_pred = (xt - x0_pred) / sigma_t
        return flow_pred.to(original_dtype)

    def forward(
        self, noisy_image_or_video: torch.Tensor, conditional_dict: dict,
        timestep: torch.Tensor, kv_cache: Optional[List[dict]] = None,
        crossattn_cache: Optional[List[dict]] = None,
        current_start: Optional[int] = None,
        current_end: Optional[int] = None
    ) -> torch.Tensor:
        """Run the model and return its x0 prediction.

        Args:
            noisy_image_or_video: Noisy input; dims 1 and 2 are swapped before
                the model call and swapped back on its output.
            conditional_dict: Must contain ``"prompt_embeds"``.
            timestep: Timesteps of shape [B, F].
            kv_cache, crossattn_cache, current_start, current_end: Forwarded
                to the model only when ``kv_cache`` is not ``None``.

        Returns:
            The x0 prediction, in the same layout as ``noisy_image_or_video``.
        """
        prompt_embeds = conditional_dict["prompt_embeds"]
        input_timestep = self._select_input_timestep(timestep)

        model_kwargs = dict(
            t=input_timestep, context=prompt_embeds, seq_len=self.seq_len)
        if kv_cache is not None:
            # The cache arguments are only supplied on the cached path.
            model_kwargs.update(
                kv_cache=kv_cache,
                crossattn_cache=crossattn_cache,
                current_start=current_start,
                current_end=current_end,
            )

        flow_pred = self.model(
            noisy_image_or_video.permute(0, 2, 1, 3, 4), **model_kwargs
        ).permute(0, 2, 1, 3, 4)

        # Merge batch and frame dims for the conversion, then restore them.
        pred_x0 = self._convert_flow_pred_to_x0(
            flow_pred=flow_pred.flatten(0, 1),
            xt=noisy_image_or_video.flatten(0, 1),
            timestep=timestep.flatten(0, 1)
        ).unflatten(0, flow_pred.shape[:2])

        return pred_x0

    def forward_input(
        self, noisy_image_or_video: torch.Tensor, conditional_dict: dict,
        timestep: torch.Tensor, block_mode: str = 'input', block_num=None,
        kv_cache: Optional[List[dict]] = None,
        crossattn_cache: Optional[List[dict]] = None,
        current_start: Optional[int] = None,
        current_end: Optional[int] = None,
        patched_x_shape: Optional[torch.Tensor] = None,
        block_x: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the model in a block mode and return its intermediate output.

        When ``block_x`` is given and ``block_mode == 'middle'`` the model is
        fed ``block_x`` as-is; otherwise it is fed ``noisy_image_or_video``
        with dims 1 and 2 swapped.

        Returns:
            ``(output, patched_x_shape)`` exactly as returned by the model.
            NOTE(review): the concrete type of ``patched_x_shape`` is not
            visible here; the annotation follows the parameter's annotation.

        Raises:
            AssertionError: If ``kv_cache`` is ``None``.
        """
        assert kv_cache is not None, "kv_cache must be provided"
        prompt_embeds = conditional_dict["prompt_embeds"]
        input_timestep = self._select_input_timestep(timestep)

        if block_x is not None and block_mode == 'middle':
            noisy_image_or_video = block_x
        else:
            noisy_image_or_video = noisy_image_or_video.permute(0, 2, 1, 3, 4)

        output, patched_x_shape = self.model(
            noisy_image_or_video,
            t=input_timestep, context=prompt_embeds,
            seq_len=self.seq_len,
            kv_cache=kv_cache,
            crossattn_cache=crossattn_cache,
            current_start=current_start,
            current_end=current_end,
            block_mode=block_mode,
            block_num=block_num,
            patched_x_shape=patched_x_shape,
        )

        return output, patched_x_shape

    def forward_output(
        self, noisy_image_or_video: torch.Tensor, conditional_dict: dict,
        timestep: torch.Tensor, block_mode: str = 'output', block_num=None,
        kv_cache: Optional[List[dict]] = None,
        crossattn_cache: Optional[List[dict]] = None,
        current_start: Optional[int] = None,
        current_end: Optional[int] = None,
        patched_x_shape: Optional[torch.Tensor] = None,
        block_x: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the model on ``block_x`` and return an x0 prediction.

        ``block_x`` is passed to the model unchanged; ``noisy_image_or_video``
        is used only as ``xt`` when converting the flow prediction to x0.

        Raises:
            AssertionError: If ``kv_cache`` is ``None``.
        """
        assert kv_cache is not None, "kv_cache must be provided"
        prompt_embeds = conditional_dict["prompt_embeds"]
        input_timestep = self._select_input_timestep(timestep)

        flow_pred = self.model(
            block_x,
            t=input_timestep, context=prompt_embeds,
            seq_len=self.seq_len,
            kv_cache=kv_cache,
            crossattn_cache=crossattn_cache,
            current_start=current_start,
            current_end=current_end,
            block_mode=block_mode,
            block_num=block_num,
            patched_x_shape=patched_x_shape,
        ).permute(0, 2, 1, 3, 4)

        # Merge batch and frame dims for the conversion, then restore them.
        pred_x0 = self._convert_flow_pred_to_x0(
            flow_pred=flow_pred.flatten(0, 1),
            xt=noisy_image_or_video.flatten(0, 1),
            timestep=timestep.flatten(0, 1)
        ).unflatten(0, flow_pred.shape[:2])

        return pred_x0
class CausalWanDiffusionWrapper(WanDiffusionWrapper):
    """Variant of ``WanDiffusionWrapper`` backed by ``CausalWanModel``.

    ``uniform_timestep`` is disabled, so the full timestep tensor is passed
    to the model instead of only its first column.
    """

    def __init__(self, model_type="T2V-1.3B"):
        """Initialise the parent, then swap in the causal model.

        Args:
            model_type: Suffix of the ``Wan2.1-*`` checkpoint directory.
        """
        # Forward model_type so the parent reads the same checkpoint directory
        # instead of always falling back to its "T2V-1.3B" default.
        super().__init__(model_type)

        # NOTE(review): the parent has already loaded a WanModel and called
        # post_init() before this replacement; confirm post_init() does not
        # need to see the causal model.
        self.model = CausalWanModel.from_pretrained(
            str(PROJECT_ROOT / f"wan_models/Wan2.1-{model_type}/"))
        self.model.eval()

        self.uniform_timestep = False