| | import os |
| | import sys |
| | import torch |
| |
|
| | |
| | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) |
| |
|
| | from wm.model.wan_base.modules.vae import _video_vae |
| |
|
| |
|
class WanVAEWrapper(torch.nn.Module):
    """Frozen wrapper around the Wan2.1 video VAE for encoding/decoding videos.

    Videos are handled in ``[B, T, C, H, W]`` layout at this interface and
    permuted to the ``[B, C, T, H, W]`` layout the underlying VAE expects.
    The wrapped model is put in eval mode with gradients disabled, since it
    is only used for inference.
    """

    def __init__(self, pretrained_path=None):
        """Load the pretrained Wan2.1 VAE.

        Args:
            pretrained_path: Path to the ``Wan2.1_VAE.pth`` checkpoint.
                Defaults to the relative path
                ``wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth`` when ``None``.
        """
        super().__init__()

        # Per-channel statistics of the 16-dim latent space, used to
        # normalize latents (stored as buffers so .to(device) moves them).
        mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        self.register_buffer("mean", torch.tensor(mean, dtype=torch.float32))
        self.register_buffer("std", torch.tensor(std, dtype=torch.float32))

        if pretrained_path is None:
            pretrained_path = "wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth"

        # Inference-only: freeze the VAE so no autograd state is tracked.
        self.model = _video_vae(
            pretrained_path=pretrained_path,
            z_dim=16,
        ).eval().requires_grad_(False)

    def _scale(self, device: torch.device, dtype: torch.dtype) -> list:
        """Return the ``[shift, inverse_scale]`` pair the VAE expects.

        Previously duplicated in ``encode`` and ``decode_to_pixel``; kept in
        one place so the normalization convention cannot drift between them.
        """
        return [
            self.mean.to(device=device, dtype=dtype),
            1.0 / self.std.to(device=device, dtype=dtype),
        ]

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a pixel video ``[B, T, C, H, W]`` into normalized latents.

        Returns latents in ``[B, T', C_z, H', W']`` layout (time axis second,
        matching the input convention). Frames per batch element are encoded
        one sample at a time to bound peak memory.
        """
        x = x.permute(0, 2, 1, 3, 4)  # [B, T, C, H, W] -> [B, C, T, H, W]

        scale = self._scale(x.device, x.dtype)

        latents = [
            self.model.encode(u.unsqueeze(0), scale).squeeze(0)
            for u in x
        ]
        latents = torch.stack(latents, dim=0)

        # Back to time-second layout for callers.
        return latents.permute(0, 2, 1, 3, 4)

    def decode_to_pixel(self, latent: torch.Tensor) -> torch.Tensor:
        """Decode latents ``[B, T', C_z, H', W']`` back to pixel space.

        Output is float32, clamped to ``[-1, 1]``, in ``[B, T, C, H, W]``
        layout. Batch elements are decoded one at a time to bound memory.
        """
        zs = latent.permute(0, 2, 1, 3, 4)  # -> [B, C_z, T', H', W']

        scale = self._scale(latent.device, latent.dtype)

        output = [
            self.model.decode(u.unsqueeze(0),
                              scale).float().clamp_(-1, 1).squeeze(0)
            for u in zs
        ]
        output = torch.stack(output, dim=0)

        return output.permute(0, 2, 1, 3, 4)
| |
|
if __name__ == "__main__":
    # Smoke test: load the VAE, round-trip a random clip, report MSE.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Testing WanVAEWrapper on {device}...")

    # Prefer the cluster checkpoint; fall back to the repo-relative path.
    ckpt_path = "/storage/ice-shared/ae8803che/hxue/data/checkpoint/wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth"
    if not os.path.exists(ckpt_path):
        print(f"Warning: Checkpoint not found at {ckpt_path}")
        ckpt_path = "wan_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth"

    try:
        vae = WanVAEWrapper(pretrained_path=ckpt_path).to(device)
        print("Model loaded successfully.")

        # Small random clip in [-1, 1]: 1 sample, 5 frames, RGB, 128x128.
        batch, frames, channels, height, width = 1, 5, 3, 128, 128
        video = torch.randn(batch, frames, channels, height, width)
        video = video.to(device).clamp(-1, 1)
        print(f"Input video shape: {video.shape}")

        with torch.no_grad():
            latent = vae.encode(video)
            print(f"Latent shape: {latent.shape}")

            decoded = vae.decode_to_pixel(latent)
            print(f"Reconstructed video shape: {decoded.shape}")

            mse = torch.nn.functional.mse_loss(video, decoded)
            print(f"Reconstruction MSE: {mse.item():.6f}")

    except Exception as e:
        print(f"Test failed: {e}")
        import traceback
        traceback.print_exc()
|