Spaces:

multimodalart
/

StreamDiffusionV2-Realtime

Running on Zero

App Files Files Community

StreamDiffusionV2-Realtime / streamdiffusionv2 /pipeline.py

multimodalart HF Staff

Upload folder using huggingface_hub

5c93746 verified about 22 hours ago

Raw

History Blame Contribute Delete

16.1 kB

	"""Readable staged video-to-video API for StreamDiffusionV2."""

	from __future__ import annotations

	from contextlib import ExitStack
	from dataclasses import dataclass
	from importlib.resources import as_file, files
	from pathlib import Path
	from typing import Literal

	from diffusers.utils import export_to_video as diffusers_export_to_video
	import numpy as np
	import torch

	from models.util import set_seed
	from streamv2v.inference import (
	SingleGPUInferencePipeline as StreamBatchInferencePipeline,
	compute_noise_scale_and_step,
	)
	from streamv2v.inference_common import load_mp4_as_tensor, merge_cli_config, normalize_acceleration_flags
	from streamv2v.inference_wo_batch import SingleGPUInferencePipeline as StreamNoBatchInferencePipeline


	# Execution modes accepted by StreamDiffusionV2Pipeline. "single" selects the
	# manager imported from streamv2v.inference (StreamBatchInferencePipeline);
	# "single-wo" selects the one imported from streamv2v.inference_wo_batch.
	SingleMode = Literal["single", "single-wo"]


	@dataclass
	class VideoChunk:
	    """A slice of the input video queued for the encode -> denoise -> decode loop.

	    Instances are produced by ``StreamDiffusionV2Pipeline.chunk_video`` and
	    consumed by ``StreamDiffusionV2Pipeline.encode_chunk``.
	    """

	    # Frames cut out of the normalized input video: input_video[:, :, start_idx:end_idx].
	    frames: torch.Tensor
	    # Half-open frame range [start_idx, end_idx) this chunk covers in the input video.
	    start_idx: int
	    end_idx: int
	    # Sequence window for this chunk; chunk_video advances it in multiples of the
	    # pipeline's frame_seq_length and it is forwarded unchanged to denoising.
	    current_start: int
	    current_end: int


	@dataclass
	class EncodedChunk:
	"""Encoded latent chunk plus the schedule metadata needed for denoising."""

	noisy_latents: torch.Tensor
	current_start: int
	current_end: int
	noise_scale: float
	current_step: int \| None = None


	@dataclass
	class DenoisedChunk:
	    """Output of one denoising call, ready to be handed to the VAE decoder."""

	    # Prediction returned by the pipeline manager for this chunk.
	    denoised_pred: torch.Tensor
	    # Forwarded verbatim as the `last_frame_only` argument of the decode call;
	    # False for the first chunk of a stream, True for every later chunk.
	    last_frame_only: bool


	def _resolve_default_config_path(resource_stack: ExitStack) -> str:
	    """Return a filesystem path for the packaged ``wan_causal_dmd_v2v.yaml`` config.

	    The ``as_file`` context is entered on ``resource_stack``, so the returned
	    path is only guaranteed to stay valid until that stack is closed.
	    """
	    packaged_config = files("streamv2v.configs").joinpath("wan_causal_dmd_v2v.yaml")
	    config_on_disk = resource_stack.enter_context(as_file(packaged_config))
	    return str(config_on_disk)


	def _resolve_device(device: str \| torch.device \| None) -> torch.device:
	cuda_available = torch.cuda.is_available()
	if device is None:
	return torch.device("cuda" if cuda_available else "cpu")
	resolved = torch.device(device)
	if resolved.type == "cuda" and not cuda_available:
	raise RuntimeError("CUDA is not available in the current Python environment")
	if resolved.type == "cuda" and resolved.index is not None:
	torch.cuda.set_device(resolved.index)
	return resolved


	def _normalize_video_tensor(
	video: str \| Path \| torch.Tensor,
	*,
	height: int,
	width: int,
	device: torch.device,
	) -> torch.Tensor:
	if isinstance(video, (str, Path)):
	tensor = load_mp4_as_tensor(str(video), resize_hw=(height, width)).unsqueeze(0)
	else:
	tensor = video
	if tensor.ndim == 4:
	tensor = tensor.unsqueeze(0)
	if tensor.ndim != 5:
	raise ValueError("video tensor must have shape [B, C, T, H, W] or [C, T, H, W]")
	if tensor.dtype != torch.bfloat16:
	tensor = tensor.to(dtype=torch.bfloat16)
	return tensor.to(device)


	def load_video(video_path: str, *, height: int = 480, width: int = 832) -> torch.Tensor:
	    """Read ``video_path`` and return it as a normalized ``[C, T, H, W]`` tensor.

	    Frames are resized to ``(height, width)`` by ``load_mp4_as_tensor``.
	    """
	    target_hw = (height, width)
	    return load_mp4_as_tensor(video_path, resize_hw=target_hw)


	def export_video(video: np.ndarray, output_path: str, *, fps: int = 16) -> str:
	    """Save a ``[T, H, W, C]`` float video array as an mp4 and return the path written.

	    Missing parent directories of ``output_path`` are created first.
	    """
	    destination = Path(output_path)
	    destination.parent.mkdir(parents=True, exist_ok=True)
	    written_path = str(destination)
	    diffusers_export_to_video(video, written_path, fps=fps)
	    return written_path


	class StreamDiffusionV2Pipeline:
	    """Readable staged single-GPU API that mirrors the offline inference flow.

	    Typical use: construct, call ``prepare(prompt)``, then either call the
	    instance on a video or drive the stages by hand
	    (``encode_video`` -> ``denoise_chunks`` -> ``decode_chunks``).

	    The instance is a context manager; ``close()`` releases the packaged
	    default config resource that may have been extracted during construction.
	    """

	    def __init__(
	        self,
	        checkpoint_folder: str,
	        *,
	        mode: SingleMode = "single",
	        config_path: str | None = None,
	        device: str | torch.device | None = None,
	        noise_scale: float = 0.8,
	        height: int = 480,
	        width: int = 832,
	        fps: int = 16,
	        step: int = 2,
	        seed: int = 0,
	        model_type: str = "T2V-1.3B",
	        use_taehv: bool = False,
	        use_tensorrt: bool = False,
	        fast: bool = False,
	        profile: bool = False,
	    ) -> None:
	        """Build the inference manager and load the checkpoint.

	        Args:
	            checkpoint_folder: Passed to the manager's ``load_model``.
	            mode: ``"single"`` or ``"single-wo"``; selects the manager class.
	            config_path: Config file to merge; the packaged
	                ``wan_causal_dmd_v2v.yaml`` is used when this is falsy.
	            device: Device spec resolved by ``_resolve_device``.
	            noise_scale, height, width, fps, step, seed, model_type, profile:
	                Stored on the instance and forwarded into the merged config.
	            use_taehv, use_tensorrt, fast: Acceleration flags, first passed
	                through ``normalize_acceleration_flags``.

	        Raises:
	            ValueError: if ``mode`` is not one of the supported values.

	        Side effects: disables autograd globally via
	        ``torch.set_grad_enabled(False)`` and seeds RNGs via ``set_seed``.
	        """
	        if mode not in {"single", "single-wo"}:
	            raise ValueError("StreamDiffusionV2Pipeline only supports 'single' and 'single-wo'")

	        self._resource_stack = ExitStack()
	        try:
	            self.mode = mode
	            self.device = _resolve_device(device)
	            self.checkpoint_folder = checkpoint_folder
	            self.noise_scale = float(noise_scale)
	            self.height = int(height)
	            self.width = int(width)
	            self.fps = int(fps)
	            self.seed = int(seed)
	            self.step = int(step)
	            self.profile = bool(profile)
	            self.model_type = model_type
	            self.prompt: str | None = None

	            # Keep the caller's original choice separately from the resolved
	            # path: a rebuild must re-resolve the packaged default on its own
	            # resource stack instead of borrowing a path owned by this one.
	            self._requested_config_path = config_path
	            resolved_config_path = config_path or _resolve_default_config_path(self._resource_stack)
	            self.config_path = resolved_config_path

	            flags = normalize_acceleration_flags(
	                {
	                    "use_taehv": use_taehv,
	                    "use_tensorrt": use_tensorrt,
	                    "fast": fast,
	                }
	            )
	            self.use_taehv = bool(flags["use_taehv"])
	            self.use_tensorrt = bool(flags["use_tensorrt"])
	            self.fast = bool(flags["fast"])

	            config_args = {
	                "config_path": resolved_config_path,
	                "checkpoint_folder": checkpoint_folder,
	                "noise_scale": noise_scale,
	                "height": height,
	                "width": width,
	                "fps": fps,
	                "step": step,
	                "seed": seed,
	                "model_type": model_type,
	                "profile": profile,
	                "use_taehv": self.use_taehv,
	                "use_tensorrt": self.use_tensorrt,
	                "fast": self.fast,
	                "t2v": False,
	                "target_fps": None,
	                "fixed_noise_scale": False,
	                "num_frames": 81,
	            }
	            self.config = merge_cli_config(resolved_config_path, config_args)

	            manager_cls = (
	                StreamBatchInferencePipeline if mode == "single" else StreamNoBatchInferencePipeline
	            )
	            torch.set_grad_enabled(False)
	            set_seed(self.seed)
	            self.pipeline_manager = manager_cls(self.config, self.device)
	            self.pipeline_manager.load_model(checkpoint_folder)
	            self.chunk_size = 4 * self.config.num_frame_per_block
	            self.num_steps = len(self.pipeline_manager.pipeline.denoising_step_list)
	            self._next_chunk_index = 0
	        except BaseException:
	            # The caller never receives the half-built instance, so nobody else
	            # could close the stack; release the extracted config here.
	            self._resource_stack.close()
	            raise

	    def close(self) -> None:
	        """Release resources registered on the internal ``ExitStack``."""
	        self._resource_stack.close()

	    def __enter__(self) -> "StreamDiffusionV2Pipeline":
	        """Return the pipeline itself for use in a ``with`` block."""
	        return self

	    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
	        """Close the pipeline on leaving the ``with`` block; exceptions propagate."""
	        self.close()

	    def enable_acceleration(
	        self,
	        *,
	        use_taehv: bool = False,
	        use_tensorrt: bool = False,
	        fast: bool = False,
	    ) -> "StreamDiffusionV2Pipeline":
	        """Rebuild the pipeline with the requested acceleration flags.

	        A fresh pipeline is constructed with the current settings, then this
	        instance's state is replaced by it. The stored prompt is cleared
	        because the replacement starts without one. Returns ``self``.
	        """
	        replacement = StreamDiffusionV2Pipeline(
	            checkpoint_folder=self.checkpoint_folder,
	            mode=self.mode,
	            # Forward what the caller originally asked for (possibly None), not
	            # the resolved path: the resolved default is tied to this instance's
	            # resource stack, which is closed just below.
	            config_path=self._requested_config_path,
	            device=self.device,
	            noise_scale=self.noise_scale,
	            height=self.height,
	            width=self.width,
	            fps=self.fps,
	            step=self.step,
	            seed=self.seed,
	            model_type=self.model_type,
	            use_taehv=use_taehv,
	            use_tensorrt=use_tensorrt,
	            fast=fast,
	            profile=self.profile,
	        )
	        self.close()
	        self.__dict__.update(replacement.__dict__)
	        return self

	    def prepare(self, prompt: str) -> None:
	        """Reset the stream state and store the prompt for the next denoising pass."""
	        self.prompt = prompt
	        self.pipeline_manager.reset_stream_state(reset_vae_flags=True)
	        self.pipeline_manager.processed = 0
	        self._next_chunk_index = 0

	    def chunk_video(self, video: str | Path | torch.Tensor) -> list[VideoChunk]:
	        """Split a full input video into the same chunks used by the offline inference loop.

	        The first chunk holds ``1 + chunk_size`` frames and every later chunk
	        holds ``chunk_size`` frames. Trailing frames that do not fill a whole
	        chunk are dropped.

	        Raises:
	            ValueError: if the video has fewer than ``1 + chunk_size`` frames.
	        """
	        input_video = _normalize_video_tensor(
	            video,
	            height=self.height,
	            width=self.width,
	            device=self.device,
	        )
	        _, _, total_frames, _, _ = input_video.shape
	        if total_frames < 1 + self.chunk_size:
	            raise ValueError(f"video must contain at least {1 + self.chunk_size} frames")

	        frame_seq_length = self.pipeline_manager.pipeline.frame_seq_length
	        blocks_per_chunk = self.chunk_size // 4

	        start_idx = 0
	        end_idx = 1 + self.chunk_size
	        current_start = 0
	        current_end = frame_seq_length * (1 + blocks_per_chunk)
	        chunks: list[VideoChunk] = [
	            VideoChunk(
	                frames=input_video[:, :, start_idx:end_idx],
	                start_idx=start_idx,
	                end_idx=end_idx,
	                current_start=current_start,
	                current_end=current_end,
	            )
	        ]

	        # Keep emitting full-size chunks while one still fits in the video.
	        while end_idx + self.chunk_size <= total_frames:
	            start_idx, end_idx = end_idx, end_idx + self.chunk_size
	            current_start = current_end
	            current_end = current_start + blocks_per_chunk * frame_seq_length
	            chunks.append(
	                VideoChunk(
	                    frames=input_video[:, :, start_idx:end_idx],
	                    start_idx=start_idx,
	                    end_idx=end_idx,
	                    current_start=current_start,
	                    current_end=current_end,
	                )
	            )
	        return chunks

	    @torch.inference_mode()
	    def encode_chunk(
	        self,
	        input_video: str | Path | torch.Tensor,
	        chunk: VideoChunk,
	        *,
	        previous_noise_scale: float | None = None,
	        initial_noise_scale: float | None = None,
	    ) -> EncodedChunk:
	        """Encode one chunk in the same style as the offline inference loop.

	        Args:
	            input_video: The full video the chunk was cut from.
	            chunk: The chunk to encode.
	            previous_noise_scale: Noise scale carried over from the previous
	                chunk; defaults to the pipeline's ``noise_scale``.
	            initial_noise_scale: Noise scale of the first chunk; defaults to
	                the pipeline's ``noise_scale``.

	        Returns:
	            The chunk's latents blended with Gaussian noise, plus the noise
	            scale and step used for it.
	        """
	        full_video = _normalize_video_tensor(
	            input_video,
	            height=self.height,
	            width=self.width,
	            device=self.device,
	        )
	        noise_scale = self.noise_scale if previous_noise_scale is None else float(previous_noise_scale)
	        init_noise_scale = self.noise_scale if initial_noise_scale is None else float(initial_noise_scale)
	        current_step = None

	        # Only chunks after the first get their noise scale / step recomputed.
	        if chunk.start_idx != 0:
	            noise_scale, current_step = compute_noise_scale_and_step(
	                full_video,
	                chunk.end_idx,
	                self.chunk_size,
	                noise_scale,
	                init_noise_scale,
	            )

	        latents = self.pipeline_manager._timed_stream_encode(chunk.frames)
	        # Swap dims 1 and 2 of the encoder output -- presumably channel/time;
	        # TODO confirm against the encoder's output layout.
	        latents = latents.transpose(2, 1).contiguous().to(dtype=torch.bfloat16)
	        noise = torch.randn_like(latents)
	        return EncodedChunk(
	            noisy_latents=noise * noise_scale + latents * (1 - noise_scale),
	            current_start=chunk.current_start,
	            current_end=chunk.current_end,
	            noise_scale=float(noise_scale),
	            current_step=current_step,
	        )

	    @torch.inference_mode()
	    def encode_video(self, video: str | Path | torch.Tensor) -> list[EncodedChunk]:
	        """Encode a full input video into noisy latent chunks.

	        The noise scale returned for each chunk is fed into the next one.
	        """
	        noise_scale = float(self.noise_scale)
	        init_noise_scale = noise_scale

	        # Normalize once and reuse the tensor for chunking and encoding, so a
	        # file input is decoded a single time; re-normalizing an already
	        # normalized tensor downstream is a no-op.
	        full_video = _normalize_video_tensor(
	            video,
	            height=self.height,
	            width=self.width,
	            device=self.device,
	        )
	        video_chunks = self.chunk_video(full_video)

	        chunks: list[EncodedChunk] = []
	        for chunk in video_chunks:
	            encoded_chunk = self.encode_chunk(
	                full_video,
	                chunk,
	                previous_noise_scale=noise_scale,
	                initial_noise_scale=init_noise_scale,
	            )
	            noise_scale = encoded_chunk.noise_scale
	            chunks.append(encoded_chunk)
	        return chunks

	    @torch.inference_mode()
	    def denoise_chunks(self, chunks: list[EncodedChunk]) -> list[DenoisedChunk]:
	        """Run DiT denoising over the encoded chunks.

	        The stream state is reset with the stored prompt first. Chunks for
	        which ``denoise_chunk`` returns ``None`` are left out of the result.

	        Raises:
	            ValueError: if ``chunks`` is empty.
	            RuntimeError: if ``prepare(prompt)`` has not been called.
	        """
	        if not chunks:
	            raise ValueError("chunks must not be empty")
	        if self.prompt is None:
	            raise RuntimeError("Call prepare(prompt) before denoise_chunks(...)")

	        self.prepare(self.prompt)
	        outputs: list[DenoisedChunk] = []
	        for chunk in chunks:
	            denoised_chunk = self.denoise_chunk(chunk)
	            if denoised_chunk is not None:
	                outputs.append(denoised_chunk)
	        return outputs

	    @torch.inference_mode()
	    def denoise_chunk(self, chunk: EncodedChunk) -> DenoisedChunk | None:
	        """Run DiT on one encoded chunk and return a decodable latent when available.

	        The first chunk after ``prepare`` goes through the manager's
	        ``prepare_pipeline``; later chunks go through ``inference_stream``
	        (``"single"``) or ``inference_wo_batch`` (``"single-wo"``).

	        Returns:
	            A ``DenoisedChunk``, or ``None`` in ``"single"`` mode while the
	            manager's ``processed`` counter is still below ``num_steps``.

	        Raises:
	            RuntimeError: if ``prepare(prompt)`` has not been called.
	        """
	        if self.prompt is None:
	            raise RuntimeError("Call prepare(prompt) before denoise_chunk(...)")

	        manager = self.pipeline_manager
	        batched = self.mode == "single"

	        if self._next_chunk_index == 0:
	            prepare_kwargs = {
	                "text_prompts": [self.prompt],
	                "noise": chunk.noisy_latents,
	                "current_start": chunk.current_start,
	                "current_end": chunk.current_end,
	            }
	            if not batched:
	                # Only the no-batch manager is passed this argument.
	                prepare_kwargs["batch_denoise"] = False
	            denoised_pred = manager.prepare_pipeline(**prepare_kwargs)
	            self._next_chunk_index += 1
	            return DenoisedChunk(denoised_pred=denoised_pred, last_frame_only=False)

	        pipeline = manager.pipeline
	        frame_seq_length = pipeline.frame_seq_length
	        current_start = chunk.current_start
	        current_end = chunk.current_end

	        # Once the window start reaches t_refresh frames, pin the window to the
	        # last frame slot of the KV cache instead of letting it keep growing.
	        if current_start // frame_seq_length >= manager.t_refresh:
	            current_start = pipeline.kv_cache_length - frame_seq_length
	            current_end = current_start + (self.chunk_size // 4) * frame_seq_length

	        run_inference = pipeline.inference_stream if batched else pipeline.inference_wo_batch
	        denoised_pred = run_inference(
	            noise=chunk.noisy_latents,
	            current_start=current_start,
	            current_end=current_end,
	            current_step=chunk.current_step,
	        )
	        manager.processed += 1
	        self._next_chunk_index += 1

	        # In "single" mode nothing is emitted until `processed` reaches num_steps.
	        if batched and manager.processed < self.num_steps:
	            return None
	        return DenoisedChunk(denoised_pred=denoised_pred, last_frame_only=True)

	    @torch.inference_mode()
	    def decode_chunks(self, chunks: list[DenoisedChunk]) -> np.ndarray:
	        """Decode denoised latent chunks into a `[T, H, W, C]` video array.

	        Per-chunk frames are concatenated along axis 0.

	        Raises:
	            ValueError: if ``chunks`` is empty.
	        """
	        if not chunks:
	            raise ValueError("chunks must not be empty")
	        decoded = [self.decode_chunk(chunk) for chunk in chunks]
	        return np.concatenate(decoded, axis=0)

	    @torch.inference_mode()
	    def decode_chunk(self, chunk: DenoisedChunk) -> np.ndarray:
	        """Decode one denoised latent chunk into `[T, H, W, C]` frames."""
	        return self.pipeline_manager._decode_video_array(
	            chunk.denoised_pred,
	            last_frame_only=chunk.last_frame_only,
	        )

	    @torch.inference_mode()
	    def __call__(self, video: str | Path | torch.Tensor) -> np.ndarray:
	        """Run the full staged pipeline after `prepare(prompt)` has been called."""
	        encoded = self.encode_video(video)
	        denoised = self.denoise_chunks(encoded)
	        return self.decode_chunks(denoised)