# fastgen-offline / FastGen / tests / test_network.py
# (Hugging Face Hub page residue kept as a comment so this file parses as
# Python: uploaded by "taohu" via huggingface_hub, commit 0839907 verified.)
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import torch
import pytest
from fastgen.utils import instantiate
# Set CUDA memory configuration for better memory management
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:512")
from fastgen.configs.net import (
EDM_CIFAR10_Config,
EDM_ImageNet64_Config,
EDM2_IN64_S_Config,
DiT_IN256_XL_Config,
SD15Config,
FluxConfig,
CogVideoXConfig,
Wan_1_3B_Config,
CausalWan_1_3B_Config,
VACE_Wan_1_3B_Config,
Wan21_I2V_14B_480P_Config,
Wan22_I2V_5B_Config,
CausalWan22_I2V_5B_Config,
CausalWan21_I2V_14B_480P_Config,
CausalWan21_I2V_14B_720P_Config,
)
from fastgen.configs.discriminator import (
Discriminator_Wan_1_3B_Config,
Discriminator_EDM_CIFAR10_Config,
Discriminator_EDM_ImageNet64_Config,
)
from fastgen.configs.config_utils import override_config_with_opts
from fastgen.utils.basic_utils import clear_gpu_memory
from fastgen.utils.test_utils import RunIf
from fastgen.utils.io_utils import set_env_vars
from unittest.mock import patch, MagicMock
def _validate_basic_scheduler_properties(scheduler, device):
"""Test basic scheduler properties and structure."""
# Test basic attributes
assert hasattr(scheduler, "max_sigma"), "Scheduler should have max_sigma attribute"
assert hasattr(scheduler, "min_t"), "Scheduler should have min_t attribute"
assert hasattr(scheduler, "max_t"), "Scheduler should have max_t attribute"
assert hasattr(scheduler, "alpha"), "Scheduler should have alpha method"
assert hasattr(scheduler, "sigma"), "Scheduler should have sigma method"
assert hasattr(scheduler, "sample_t"), "Scheduler should have sample_t method"
# Validate time range
assert scheduler.min_t < scheduler.max_t, f"min_t ({scheduler.min_t}) should be < max_t ({scheduler.max_t})"
if scheduler.min_t is not None:
assert scheduler.min_t >= 0, f"min_t ({scheduler.min_t}) should be non-negative"
if scheduler.max_t is not None:
# Allow very large max_t values (some schedules use very large values)
assert scheduler.max_t <= 10000, f"max_t ({scheduler.max_t}) should be reasonable (<=10000)"
def _validate_time_sampling_and_functions(scheduler, device):
"""Test time sampling and alpha/sigma function behavior."""
# Test time sampling and validation
batch_size = 2
t = scheduler.sample_t(batch_size, time_dist_type="uniform")
assert t.shape == (batch_size,), f"Expected shape ({batch_size},), got {t.shape}"
# Only validate if scheduler has proper time bounds
assert scheduler.is_t_valid(t), f"Sampled times {t} should be valid"
# Test alpha and sigma functions with sampled times
alpha_t = scheduler.alpha(t)
sigma_t = scheduler.sigma(t)
assert alpha_t.shape == t.shape, f"alpha(t) shape {alpha_t.shape} should match t shape {t.shape}"
assert sigma_t.shape == t.shape, f"sigma(t) shape {sigma_t.shape} should match t shape {t.shape}"
assert torch.all(alpha_t >= 0), f"alpha(t) should be non-negative, got {alpha_t}"
assert torch.all(sigma_t >= 0), f"sigma(t) should be non-negative, got {sigma_t}"
return t, alpha_t, sigma_t
def _validate_boundary_values(scheduler, device):
"""Test alpha and sigma at boundary time values."""
# Test boundary values
min_t_tensor = torch.tensor([scheduler.min_t]).to(device)
max_t_tensor = torch.tensor([scheduler.max_t]).to(device)
alpha_min = scheduler.alpha(min_t_tensor)
alpha_max = scheduler.alpha(max_t_tensor)
sigma_min = scheduler.sigma(min_t_tensor)
sigma_max = scheduler.sigma(max_t_tensor)
# max_sigma should match sigma at max_t (with relaxed tolerance for floating point precision)
max_sigma_tensor = torch.tensor(scheduler.max_sigma, device=device, dtype=sigma_max.dtype)
# Use higher tolerance for bfloat16 due to lower precision
rtol = 1e-2 if sigma_max.dtype == torch.bfloat16 else 1e-3
atol = 1e-2 if sigma_max.dtype == torch.bfloat16 else 1e-3
assert torch.allclose(
sigma_max, max_sigma_tensor, rtol=rtol, atol=atol
), f"max_sigma ({scheduler.max_sigma}) should match sigma(max_t) ({sigma_max.item()}) within tolerance"
return alpha_min, alpha_max, sigma_min, sigma_max
def _validate_edm_schedule(scheduler, t, alpha_t, sigma_t, sigma_min, sigma_max):
"""Validate EDM-specific schedule properties."""
# EDM: alpha(t) = 1, sigma(t) = t
assert torch.allclose(alpha_t, torch.ones_like(alpha_t), rtol=1e-4), f"EDM: alpha(t) should be 1, got {alpha_t}"
assert torch.allclose(sigma_t, t, rtol=1e-4), f"EDM: sigma(t) should equal t, got σ(t)={sigma_t}, t={t}"
# SNR should be monotonically decreasing (sigma increases)
assert sigma_min <= sigma_max, (
f"EDM: sigma should increase with t, got "
f"σ({scheduler.min_t})={sigma_min.item():.4f} "
f"> σ({scheduler.max_t})={sigma_max.item():.4f}"
)
# max_sigma should match max_t for EDM
assert torch.allclose(
torch.tensor(scheduler.max_sigma), torch.tensor(scheduler.max_t), rtol=1e-3
), f"EDM: max_sigma ({scheduler.max_sigma}) should equal max_t ({scheduler.max_t})"
if scheduler.max_t is not None:
assert scheduler.max_t <= 100.0, f"EDM: max_t should be reasonable (≤100), got {scheduler.max_t}"
def _validate_rectified_flow_schedule(scheduler, device, t, alpha_t, sigma_t):
"""Validate Rectified Flow-specific mathematical properties."""
# Rectified Flow Schedule - Linear Interpolation Properties
# 1. Core RF property: α(t) = 1-t, σ(t) = t, so α(t) + σ(t) = 1
assert torch.allclose(
alpha_t + sigma_t, torch.ones_like(alpha_t), rtol=1e-4, atol=1e-5
), f"RF: α(t) + σ(t) should equal 1, got α+σ = {(alpha_t + sigma_t).tolist()}"
# 2. Linear relationships for RF
assert torch.allclose(
alpha_t, 1.0 - t, rtol=1e-4, atol=1e-5
), f"RF: α(t) should equal 1-t, got α(t)={alpha_t.tolist()}, expected={1.0 - t}"
assert torch.allclose(
sigma_t, t, rtol=1e-4, atol=1e-5
), f"RF: σ(t) should equal t, got σ(t)={sigma_t.tolist()}, expected={t.tolist()}"
# 3. Boundary conditions (with relaxed tolerance)
max_t_tensor = torch.tensor(scheduler.max_t, dtype=torch.float32)
max_sigma_tensor = torch.tensor(scheduler.max_sigma, dtype=torch.float32)
assert torch.allclose(
max_sigma_tensor, max_t_tensor, rtol=1e-3, atol=1e-4
), f"RF: max_sigma should equal max_t, got max_σ={scheduler.max_sigma}, max_t={scheduler.max_t}"
# 4. Test that at t≈0, α≈1 and σ≈0 (if min_t is close to 0)
if scheduler.min_t <= 0.01:
near_zero_t = torch.tensor([scheduler.min_t]).to(device)
alpha_zero = scheduler.alpha(near_zero_t)
sigma_zero = scheduler.sigma(near_zero_t)
expected_alpha_zero = 1.0 - scheduler.min_t
assert torch.allclose(alpha_zero, torch.tensor([expected_alpha_zero]).to(device), rtol=1e-2), (
f"RF: α(t≈0) should ≈ 1-min_t, got "
f"α({scheduler.min_t})={alpha_zero.item():.4f}, "
f"expected {expected_alpha_zero:.4f}"
)
assert torch.allclose(sigma_zero, near_zero_t, rtol=1e-2), (
f"RF: σ(t≈0) should ≈ min_t, "
f"got σ({scheduler.min_t})={sigma_zero.item():.4f}, "
f"expected {scheduler.min_t}"
)
# 5. Variance preservation: α²(t) + σ²(t) = (1-t)² + t² = 1 - 2t + 2t²
variance_sum = alpha_t**2 + sigma_t**2
expected_variance = 1 - 2 * t + 2 * t**2
assert torch.allclose(variance_sum, expected_variance, rtol=1e-3, atol=1e-4), (
f"RF: α²(t) + σ²(t) should equal 1-2t+2t², got "
f"{variance_sum.tolist()}, expected {expected_variance.tolist()}"
)
def _validate_ddpm_based_schedule(scheduler, alpha_t, sigma_t, alpha_min, alpha_max, sigma_min, sigma_max):
"""Validate DDPM-based schedule properties (shared by SD, CogVideoX, Alphas)."""
# DDPM constraint: alpha²(t) + sigma²(t) = 1
alpha_squared_plus_sigma_squared = alpha_t**2 + sigma_t**2
expected_ones = torch.ones_like(alpha_squared_plus_sigma_squared)
assert torch.allclose(
alpha_squared_plus_sigma_squared, expected_ones, rtol=1e-2, atol=1e-3
), f"DDPM: α²(t) + σ²(t) should equal 1, got {alpha_squared_plus_sigma_squared}"
# Monotonicity checks
assert alpha_min >= alpha_max, (
f"DDPM: alpha should decrease with t, "
f"got α({scheduler.min_t})={alpha_min.item():.4f} < "
f"α({scheduler.max_t})={alpha_max.item():.4f}"
)
assert sigma_min <= sigma_max, (
f"DDPM: sigma should increase with t, "
f"got σ({scheduler.min_t})={sigma_min.item():.4f} "
f"> σ({scheduler.max_t})={sigma_max.item():.4f}"
)
def validate_noise_scheduler_properties(teacher, device, expected_schedule_type=None):
    """
    Comprehensive and consistent noise scheduler testing helper.

    Args:
        teacher: The instantiated network model
        device: Device to run tests on
        expected_schedule_type: Expected schedule type string (e.g., "edm", "rf", "sd")

    Raises:
        AssertionError: if the scheduler violates any checked property.
        ValueError: if expected_schedule_type is not a recognized schedule family.
    """
    # Basic existence check
    assert hasattr(teacher, "noise_scheduler"), "Model should have noise_scheduler attribute"
    scheduler = teacher.noise_scheduler
    assert teacher.schedule_type == expected_schedule_type
    assert scheduler.max_t is not None and scheduler.min_t is not None
    # Step 1: Validate basic properties
    _validate_basic_scheduler_properties(scheduler, device)
    # Step 2: Test time sampling and alpha/sigma functions
    t, alpha_t, sigma_t = _validate_time_sampling_and_functions(scheduler, device)
    # Step 3: Test boundary values
    alpha_min, alpha_max, sigma_min, sigma_max = _validate_boundary_values(scheduler, device)
    # Step 4: Schedule-specific validations. "sd"/"sdxl"/"cogvideox"/"alphas"
    # all share the same DDPM-style constraints, so the previously duplicated
    # elif branches are merged into a single dispatch.
    if expected_schedule_type == "edm":
        _validate_edm_schedule(scheduler, t, alpha_t, sigma_t, sigma_min, sigma_max)
    elif expected_schedule_type in ("rf", "rectified_flow"):
        _validate_rectified_flow_schedule(scheduler, device, t, alpha_t, sigma_t)
    elif expected_schedule_type in ("sd", "sdxl", "cogvideox", "alphas"):
        _validate_ddpm_based_schedule(scheduler, alpha_t, sigma_t, alpha_min, alpha_max, sigma_min, sigma_max)
    else:
        raise ValueError(f"Unrecognized schedule type: {expected_schedule_type}")
def test_network_edm_cifar10():
    """Smoke-test a shrunken EDM CIFAR-10 network: scheduler, forward, features."""
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if on_gpu else "cpu")
    dtype = torch.bfloat16 if on_gpu else torch.float32
    # Shrink the architecture so instantiation and forward passes stay cheap.
    config = override_config_with_opts(
        EDM_CIFAR10_Config,
        ["-", "img_resolution=2", "model_channels=32", "channel_mult=[1]", "channel_mult_noise=1", "r_timestep=False"],
    )
    model = instantiate(config).to(device=device, dtype=dtype)
    # The EDM family must expose a valid "edm" noise schedule.
    validate_noise_scheduler_properties(model, device, expected_schedule_type="edm")
    n = 1
    images = torch.randn(n, 3, 2, 2, device=device, dtype=dtype)
    # Sample in-range timesteps straight from the model's own scheduler.
    timesteps = model.noise_scheduler.sample_t(n, time_dist_type="polynomial").to(device=device, dtype=dtype)
    class_ids = torch.randint(0, 10, (n,), device=device)
    onehot = torch.nn.functional.one_hot(class_ids, num_classes=10).to(dtype=dtype)
    prediction = model(images, timesteps, onehot)
    assert prediction.shape == images.shape  # output mirrors the input shape
    assert prediction.device == images.device  # and stays on the same device
    # Early feature return with an empty index set should produce a list.
    extracted = model(images, timesteps, onehot, return_features_early=True, feature_indices=set())
    assert isinstance(extracted, list)
def test_network_edm_imagenet64():
    """Smoke-test a shrunken EDM ImageNet-64 network: scheduler, forward, features."""
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if on_gpu else "cpu")
    dtype = torch.bfloat16 if on_gpu else torch.float32
    # Shrink the architecture so instantiation and forward passes stay cheap.
    config = override_config_with_opts(
        EDM_ImageNet64_Config,
        ["-", "img_resolution=2", "model_channels=32", "channel_mult=[1]", "num_blocks=1", "r_timestep=False"],
    )
    model = instantiate(config).to(device=device, dtype=dtype)
    # The EDM family must expose a valid "edm" noise schedule.
    validate_noise_scheduler_properties(model, device, expected_schedule_type="edm")
    n = 1
    images = torch.randn(n, 3, 2, 2, device=device, dtype=dtype)
    # Sample in-range timesteps straight from the model's own scheduler.
    timesteps = model.noise_scheduler.sample_t(n, time_dist_type="lognormal").to(device=device, dtype=dtype)
    class_ids = torch.randint(0, 1000, (n,), device=device)
    onehot = torch.nn.functional.one_hot(class_ids, num_classes=1000).to(dtype=dtype)
    prediction = model(images, timesteps, onehot)
    assert prediction.shape == images.shape  # output mirrors the input shape
    assert prediction.device == images.device  # and stays on the same device
    # Early feature return with an empty index set should produce a list.
    extracted = model(images, timesteps, onehot, return_features_early=True, feature_indices=set())
    assert isinstance(extracted, list)
def test_network_edm2_in64():
    """Smoke-test a shrunken EDM2 ImageNet-64 S network, including feature returns."""
    on_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if on_gpu else "cpu")
    dtype = torch.bfloat16 if on_gpu else torch.float32
    # Shrink the architecture so instantiation and forward passes stay cheap.
    config = override_config_with_opts(
        EDM2_IN64_S_Config, ["-", "img_resolution=2", "model_channels=32", "channel_mult=[1]", "num_blocks=1"]
    )
    model = instantiate(config).to(device=device, dtype=dtype)
    # EDM2 still advertises the "edm" schedule family.
    validate_noise_scheduler_properties(model, device, expected_schedule_type="edm")
    n = 1
    images = torch.randn(n, 3, 2, 2, device=device, dtype=dtype)
    timesteps = model.noise_scheduler.sample_t(n, time_dist_type="uniform").to(device=device, dtype=dtype)
    class_ids = torch.randint(0, 1000, (n,), device=device)
    onehot = torch.nn.functional.one_hot(class_ids, num_classes=1000).to(dtype=dtype)
    prediction = model(images, timesteps, onehot)
    assert prediction.shape == images.shape  # output mirrors the input shape
    assert prediction.device == images.device  # and stays on the same device
    # With feature_indices given but empty, only the model output comes back.
    plain_output = model(images, timesteps, onehot, feature_indices=set())
    assert plain_output.shape == torch.Size([n, 3, 2, 2])
    # Early feature return with an empty index set yields an empty list
    # (num_blocks=1 leaves nothing to extract anyway).
    extracted = model(images, timesteps, onehot, return_features_early=True, feature_indices=set())
    assert isinstance(extracted, list)
    assert len(extracted) == 0
def test_network_dit_in256_xl():
    """
    Lightweight test that mocks the VAE to avoid downloading models.

    Shrinks the DiT config (input_size/hidden_size/depth/num_heads), patches
    ``diffusers.AutoencoderKL.from_pretrained`` with a MagicMock so no weights
    are fetched, then exercises the forward pass and the return_logvar path.
    """
    teacher_config = DiT_IN256_XL_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # float16 is used unconditionally here (even on CPU) — NOTE(review): on a
    # CPU-only run some ops may not support half precision; confirm intended.
    dtype = torch.float16
    # Override input_size to match our test input dimensions
    teacher_config = override_config_with_opts(
        teacher_config, ["-", "input_size=2", "hidden_size=32", "depth=1", "num_heads=1"]
    )
    # Mock the VAE to avoid downloading
    with patch("diffusers.AutoencoderKL.from_pretrained") as mock_vae_from_pretrained:
        mock_vae = MagicMock()
        # Canned decode output; shape is arbitrary since decode is never asserted on here.
        mock_vae.decode.return_value = torch.randn(1, 3, 16, 16, device=device, dtype=dtype)
        mock_vae_from_pretrained.return_value = mock_vae
        teacher = instantiate(teacher_config)
        teacher.vae = mock_vae
    # Ensure the model is on the correct device and dtype
    teacher = teacher.to(device=device, dtype=dtype)
    batch_size = 1
    # Use input size that matches the overridden config
    x = torch.randn(batch_size, 4, 2, 2, device=device, dtype=dtype)
    # Use scheduler to sample valid time steps with correct dtype
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform")
    t = t.to(device=device, dtype=dtype)  # Ensure timestep has correct dtype
    # Test noise scheduler properties comprehensively (DiT uses rectified flow)
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    labels = torch.randint(0, 1000, (batch_size,), device=device)
    labels = torch.nn.functional.one_hot(labels, num_classes=1000).to(device=device, dtype=dtype)
    # Test basic forward pass
    output = teacher(x, t, labels)
    assert output.shape == x.shape
    assert output.device == x.device
    # Test with return_logvar: model additionally returns a per-sample log-variance
    output, logvar = teacher(x, t, labels, return_logvar=True)
    assert output.shape == x.shape
    assert logvar.shape == torch.Size([batch_size, 1])
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_sd15():
    """GPU smoke test for the Stable Diffusion 1.5 network.

    Loads the full model (large download), validates its "sd" noise schedule,
    runs an autocast forward pass, then a non-autocast pass after casting the
    whole model to the target dtype, and checks early feature extraction.
    """
    teacher_config = SD15Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # SD15 doesn't support these overrides, so we'll test with default config
    # but use smaller inputs for testing
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="sd")
    batch_size = 1
    x = torch.randn(batch_size, 4, 8, 8, device=device, dtype=dtype)  # Smaller than original but reasonable
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device)
    captions = ["a caption"]
    condition = teacher.text_encoder.encode(captions)
    # SD15 text encoder returns (embeddings, attention_mask) tuple
    assert isinstance(condition, tuple) and len(condition) == 2
    embeddings, attention_mask = condition
    embeddings = embeddings.to(device=device, dtype=dtype)
    attention_mask = attention_mask.to(device=device, dtype=dtype)
    condition = (embeddings, attention_mask)
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    # Now cast the whole model and repeat without autocast.
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward pass without autocast
    output = teacher(x, t, condition=condition)
    # Early feature return with an empty index set should produce a list.
    output = teacher(x, t, condition=condition, return_features_early=True, feature_indices=set())
    assert isinstance(output, list)  # confirm output is a list
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_flux():
    """Test Flux network for text-to-image generation.

    Skips when GPU memory is insufficient or the gated HuggingFace model is
    inaccessible. Exercises autocast and non-autocast forward passes (with a
    guidance tensor), return_logvar, and feature extraction in all variants.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    teacher_config = FluxConfig
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Check available GPU memory before attempting to load Flux model
    if torch.cuda.is_available():
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
        if total_memory < 40:  # Flux model needs significant GPU memory
            pytest.skip(f"Test skipped: Flux model requires ~40GB GPU memory, but only {total_memory:.1f}GB available")
    # Try to instantiate Flux model - skip if not accessible (gated model)
    try:
        teacher = instantiate(teacher_config)
    except OSError as e:
        # Only skip for the known "gated/unauthenticated" failure modes;
        # anything else is a genuine error and is re-raised.
        if "not a valid model identifier" in str(e) or "token" in str(e):
            pytest.skip(f"Test skipped: Flux model not accessible (requires HuggingFace authentication): {e}")
        raise
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties - Flux uses rectified flow
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    # Flux operates on latent space: [B, C, H, W] where C=16 for Flux VAE
    x = torch.randn(batch_size, 16, 8, 8, device=device, dtype=dtype)
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    captions = ["a caption"]
    condition = teacher.text_encoder.encode(captions)
    # Per-sample guidance strength passed alongside the condition.
    guidance_scale = 3.5
    guidance_tensor = torch.full((batch_size,), guidance_scale, device=x.device, dtype=x.dtype)
    # Flux text encoder returns (pooled_prompt_embeds, prompt_embeds) tuple
    assert isinstance(condition, tuple) and len(condition) == 2
    pooled_prompt_embeds, prompt_embeds = condition
    pooled_prompt_embeds = pooled_prompt_embeds.to(device=device, dtype=dtype)
    prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
    condition = (pooled_prompt_embeds, prompt_embeds)
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition, guidance=guidance_tensor)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    # Cast the whole model and continue without autocast.
    teacher = teacher.to(device=device, dtype=dtype)
    # Test with return_logvar
    output, logvar = teacher(x, t, condition, guidance=guidance_tensor, return_logvar=True)
    assert output.shape == x.shape
    assert logvar.shape == torch.Size([batch_size, 1])
    # Test feature extraction with empty set
    output = teacher(x, t, condition, guidance=guidance_tensor, return_features_early=True, feature_indices=set())
    assert isinstance(output, list)  # confirm output is a list
    assert len(output) == 0  # empty feature_indices should return empty list
    # Test feature extraction with non-empty set (extract from first transformer block)
    output = teacher(x, t, condition, guidance=guidance_tensor, return_features_early=False, feature_indices={0})
    assert isinstance(output, list) and len(output) == 2  # [model_output, features]
    assert output[0].shape == x.shape  # model output shape
    assert isinstance(output[1], list) and len(output[1]) == 1  # one feature extracted
    # Test feature extraction with early return
    features = teacher(x, t, condition, guidance=guidance_tensor, return_features_early=True, feature_indices={0})
    assert isinstance(features, list)
    assert len(features) == 1  # one feature extracted
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_cogvideox():
    """GPU smoke test for the CogVideoX video network.

    Validates the "cogvideox" noise schedule, runs an autocast forward pass
    on a tiny latent video, and checks both feature-extraction modes.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = CogVideoXConfig
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # CogVideoX doesn't support model_channels override, use default config but test with smaller inputs
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="cogvideox")
    batch_size = 1
    C, T, H, W = 16, 2, 4, 4  # Reduced from 16, 4, 8, 16
    # B, C, T, H, W
    x = torch.randn(batch_size, C, T, H, W, device=device, dtype=dtype)
    # Use scheduler to sample valid time steps (CogVideoX uses integer timesteps)
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    captions = ["a caption"]
    condition = teacher.text_encoder.encode(captions)
    # Handle case where text encoder returns tuple or needs device placement
    if isinstance(condition, tuple):
        condition = condition[0]  # Take the first element if it's a tuple
    condition = condition.to(device=device, dtype=dtype)
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    # Cast the whole model for the remaining (non-autocast) passes.
    teacher = teacher.to(device=device, dtype=dtype)
    # return features, without early return: output is [model_output, features]
    output = teacher(x, t, condition=condition, return_features_early=False, feature_indices={0})
    assert output[0].shape == x.shape
    assert isinstance(output[1], list) and len(output[1]) == 1
    for feature in output[1]:
        assert feature.shape == (batch_size, 480, T, H, W)
    # return features, with early return: output is just the feature list
    output = teacher(x, t, condition=condition, return_features_early=True, feature_indices={0})
    assert isinstance(output, list)  # confirm output is a list
    for feature in output:
        # Feature shape should match the new tensor dimensions (original model channels)
        expected_channels = 480  # Original CogVideoX channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_wan():
    """GPU smoke test for the Wan 1.3B video network.

    Validates the "rf" noise schedule, runs autocast and non-autocast
    forward passes on a tiny latent video, checks intermediate-feature
    shapes, and re-validates after setting schedule_type="rf" explicitly.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = Wan_1_3B_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Wan config doesn't support model_channels/num_blocks override, use default
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    T, H, W = 2, 4, 4  # Reduced from 4, 8, 16
    x = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # Use scheduler to sample valid time steps
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    condition = teacher.text_encoder.encode(["a caption"])
    # Handle case where text encoder returns tuple or needs device placement
    if isinstance(condition, tuple):
        condition = condition[0]  # Take the first element if it's a tuple
    condition = condition.to(device=device, dtype=dtype)
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward pass without autocast
    output = teacher(x, t, condition=condition)
    output = teacher(x, t, condition=condition, return_features_early=True, feature_indices={0})
    assert isinstance(output, list)  # confirm output is a list
    for feature in output:
        # Use original model channels (not overridden)
        expected_channels = 384  # Original Wan channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # Re-instantiate with an explicit rf schedule and re-validate. The previous
    # version mutated the module-level Wan_1_3B_Config in place and never
    # restored it, leaking the change into every other test that imports the
    # same config object; save and restore the attribute so the test leaves
    # global state untouched even if an assertion fails.
    original_schedule_type = teacher_config.schedule_type
    teacher_config.schedule_type = "rf"
    try:
        teacher = instantiate(teacher_config)
        teacher.init_preprocessors()
        teacher = teacher.to(device=device, dtype=dtype)
        # Test RF schedule properties comprehensively
        validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    finally:
        teacher_config.schedule_type = original_schedule_type
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_vace_wan():
    """GPU smoke test for the VACE Wan 1.3B video network.

    Skips when free GPU memory is insufficient. Builds a combined condition
    of text embeddings plus VACE video conditioning, then runs autocast and
    non-autocast forward passes and a feature-extraction pass.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    # Check available GPU memory before loading model
    if torch.cuda.is_available():
        free_memory = (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)) / (1024**3)
        if free_memory < 20:  # Need ~20GB free for this test
            pytest.skip(f"Test skipped: requires ~20GB free GPU memory, but only {free_memory:.1f}GB available")
    set_env_vars()
    teacher_config = VACE_Wan_1_3B_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    # B, C, T, H, W
    x = torch.randn(batch_size, 16, 2, 4, 4, device=device, dtype=dtype)  # Reduced from 21, 60, 104
    # Use scheduler to sample valid time steps
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="logitnormal").to(device=device, dtype=dtype)
    # Prepare text embeddings
    captions = ["a caption"]
    text_embeds = teacher.text_encoder.encode(captions).to(device=device, dtype=dtype)
    # Prepare video context for VACE conditioning with much smaller video
    # Create a dummy video for depth extraction (B, C, T, H, W) in [-1, 1]
    context_video = torch.randn(batch_size, 3, 2, 16, 16, device=device, dtype=dtype)  # Much smaller
    context_video = torch.clamp(context_video, -1, 1)  # Ensure it's in [-1, 1] range
    # Prepare VACE conditioning
    vid_context = teacher.prepare_vid_conditioning(context_video)
    # Create condition dict with both text_embeds and vid_context
    condition = {"text_embeds": text_embeds, "vid_context": vid_context}
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward pass without autocast
    output = teacher(x, t, condition=condition)
    output = teacher(x, t, condition=condition, return_features_early=True, feature_indices={0})
    assert isinstance(output, list)  # confirm output is a list
    expected_channels = 384  # Original VACE channels (not overridden)
    for feature in output:
        assert feature.shape == (batch_size, expected_channels, 2, 4, 4)  # Adjusted dimensions
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
def test_network_discriminator_wan():
    """
    Lightweight unit test for Discriminator_Wan implementation.
    Tests core functionality with minimal memory usage.

    Loops over the memory-efficient single-head architectures, then tests a
    two-head factorized discriminator. NOTE(review): the loop reassigns
    discriminator_config with each override, so later iterations (and the
    multi-head section) build on the previously overridden config — this works
    because every field uses the forced "++" override, but it is cumulative.
    """
    set_env_vars()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    discriminator_config = Discriminator_Wan_1_3B_Config
    # Use larger dimensions to avoid kernel size issues
    batch_size = 1
    inner_dim = 16
    T, H, W = 8, 8, 8  # Increased from 2, 4, 4 to avoid kernel size errors
    # Create dummy features with larger spatial dimensions
    dummy_features = [torch.randn(batch_size, inner_dim, T, H, W, device=device, dtype=dtype)]
    # Test only the most memory-efficient architectures
    efficient_architectures = [
        "conv3d_down_mlp_efficient",
        "multiscale_down_mlp_efficient",
    ]
    # Test single-head discriminator with config-based approach
    for arch_name in efficient_architectures:
        # Configure the discriminator config with smaller parameters
        # Use ++ to force override existing fields
        discriminator_config = override_config_with_opts(
            discriminator_config,
            ["-", f"++disc_type={arch_name}", f"++inner_dim={inner_dim}", "++num_blocks=2", "++feature_indices=[0]"],
        )
        # Instantiate using config
        discriminator = instantiate(discriminator_config)
        # Keep on device with appropriate dtype
        discriminator = discriminator.to(device=device, dtype=dtype)
        # Test forward pass
        with torch.no_grad():
            output = discriminator(dummy_features)
        # Basic shape verification: one logit per sample for a single head
        expected_shape = torch.Size([batch_size, 1])
        assert output.shape == expected_shape, f"Expected {expected_shape}, got {output.shape}"
        # Basic numerical verification
        assert torch.isfinite(output).all(), "Output contains NaN or Inf values"
        # Basic parameter counting (should be reasonable)
        total_params = sum(p.numel() for p in discriminator.parameters() if p.requires_grad)
        assert total_params > 10, f"Too few parameters: {total_params}"
        assert total_params < 50_000_000, f"Too many parameters: {total_params}"  # Increased limit for real models
    # Test multi-head discriminator with lightweight config
    discriminator_config = override_config_with_opts(
        discriminator_config,
        [
            "-",
            "++disc_type=factorized_down_mlp_efficient",
            f"++inner_dim={inner_dim}",
            "++num_blocks=2",
            "++feature_indices=[0,1]",
        ],
    )
    multi_head_discriminator = instantiate(discriminator_config)
    multi_head_discriminator = multi_head_discriminator.to(device=device, dtype=dtype)
    # Create features for multiple heads (2 heads) with larger dimensions
    multi_head_features = [torch.randn(batch_size, inner_dim, T, H, W, device=device, dtype=dtype) for _ in range(2)]
    with torch.no_grad():
        output = multi_head_discriminator(multi_head_features)
    # Verify output shape for multi-head
    expected_shape = torch.Size([batch_size, 2])  # Two heads output
    assert output.shape == expected_shape, f"Expected {expected_shape}, got {output.shape}"
    assert torch.isfinite(output).all(), "Multi-head output contains NaN or Inf values"
def test_network_discriminator_edm():
    """
    Lightweight unit test for Discriminator_EDM implementation.
    Tests core functionality with minimal memory usage for both CIFAR10 and ImageNet64 configurations.
    """
    set_env_vars()
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    dtype = torch.bfloat16 if cuda_available else torch.float32
    # One row per tested variant: (name, config, resolutions, in_channels, feature_indices).
    # feature_indices=None exercises the default behavior (last index only).
    cases = [
        ("CIFAR10", Discriminator_EDM_CIFAR10_Config, [32, 16, 8], 256, {0, 1, 2}),
        ("ImageNet64", Discriminator_EDM_ImageNet64_Config, [64, 32, 16, 8], 768, None),
    ]
    batch_size = 1
    for name, config, resolutions, in_channels, feature_indices in cases:
        print(f"Testing EDM Discriminator {name} configuration...")
        # Shrink the channel count so the test stays cheap on memory
        test_in_channels = min(in_channels, 128)
        # Override config for lightweight testing
        config = override_config_with_opts(
            config,
            ["-", f"in_channels={test_in_channels}", f"all_res={resolutions}"],
        )
        discriminator = instantiate(config).to(device=device, dtype=dtype)
        # Resolve which feature indices to probe: default is the last level,
        # otherwise keep only the provided indices that fall in the valid range.
        if feature_indices is None:
            test_feature_indices = [len(resolutions) - 1]
        else:
            test_feature_indices = sorted(i for i in feature_indices if i < len(resolutions))
        # EDM discriminator consumes 2D feature maps [B, C, H, W] (no time axis,
        # unlike the WAN discriminator which takes [B, C, T, H, W])
        dummy_features = [
            torch.randn(batch_size, test_in_channels, resolutions[idx], resolutions[idx], device=device, dtype=dtype)
            for idx in test_feature_indices
        ]
        # Forward pass and basic shape / numeric sanity checks
        with torch.no_grad():
            output = discriminator(dummy_features)
        expected_shape = torch.Size([batch_size, len(test_feature_indices)])
        assert output.shape == expected_shape, f"Expected {expected_shape}, got {output.shape}"
        assert torch.isfinite(output).all(), f"Output contains NaN or Inf values for {name}"
        # Output should be reasonable discriminator logits
        assert output.abs().max() < 100, f"Output values seem too large for {name}: {output}"
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_causal_wan():
    """
    Test CausalWan network, specifically the sample method.

    Covers: autocast and plain forward passes, KV-cache storage (store_kv),
    autoregressive sampling, inhomogeneous per-frame timesteps, early feature
    extraction, and frame-count edge cases (single frame; frame count not
    divisible by chunk_size).
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    # Check available GPU memory before loading model
    if torch.cuda.is_available():
        free_memory = (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)) / (1024**3)
        if free_memory < 20:  # Need ~20GB free for this test
            pytest.skip(f"Test skipped: requires ~20GB free GPU memory, but only {free_memory:.1f}GB available")
    set_env_vars()
    teacher_config = CausalWan_1_3B_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Instantiate CausalWan and prepare
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # RF schedule for WAN
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    C, T, H, W = 16, 3, 4, 4  # Use T divisible by chunk_size=3
    # B, C, T, H, W
    x = torch.randn(batch_size, C, T, H, W, device=device, dtype=dtype)
    # CausalWan supports 2D timesteps with shape (batch_size, num_frames)
    t_1d = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    t = t_1d.unsqueeze(1).expand(batch_size, T)  # Shape: (batch_size, T)
    captions = ["a test caption for causal WAN"]
    condition = teacher.text_encoder.encode(captions)
    # Some text encoders return (embeddings, ...) tuples; keep only the embeddings
    if isinstance(condition, tuple):
        condition = condition[0]
    condition = condition.to(device=device, dtype=dtype)
    # Negative condition for classifier-free guidance path
    neg_condition = teacher.text_encoder.encode([""])
    if isinstance(neg_condition, tuple):
        neg_condition = neg_condition[0]
    neg_condition = neg_condition.to(device=device, dtype=dtype)
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape, f"Expected shape {x.shape}, got {output.shape}"
    assert output.device == x.device, f"Expected device {x.device}, got {output.device}"
    teacher = teacher.to(device=device, dtype=dtype)
    # Standard forward pass without autocast
    output = teacher(x, t, condition=condition)
    # Forward with store_kv=True (needed for autoregressive sampling caches)
    output_with_kv = teacher(x, t, condition=condition, store_kv=True)
    assert output_with_kv.shape == x.shape, f"Expected shape {x.shape}, got {output_with_kv.shape}"
    # Test the sample method
    # NOTE(review): the sibling causal I2V tests call teacher.clear_caches() between the
    # store_kv forward and sample() — confirm whether CausalWan needs the same here.
    original_noise = torch.randn(batch_size, C, T, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        ar_output = teacher.sample(
            noise=original_noise,
            condition=condition,
            neg_condition=neg_condition,
        )
    assert ar_output.shape == original_noise.shape, f"Expected shape {original_noise.shape}, got {ar_output.shape}"
    assert ar_output.device == original_noise.device, f"Expected device {original_noise.device}, got {ar_output.device}"
    assert ar_output.dtype == original_noise.dtype, f"Expected dtype {original_noise.dtype}, got {ar_output.dtype}"
    # Inhomogeneous timestep sampling and forward process
    t_inhom, idx = teacher.noise_scheduler.sample_t_inhom(batch_size, T, teacher.chunk_size, sample_steps=4)
    t_inhom = t_inhom.to(device=device, dtype=dtype)
    t_inhom_reshaped = t_inhom[:, None, :, None, None]  # shape: (batch_size, 1, T, 1, 1) corresponds to (B,C,T,H,W)
    eps_inhom = torch.randn_like(x)
    noisy = teacher.noise_scheduler.forward_process(x, eps_inhom, t_inhom_reshaped)
    assert noisy.shape == x.shape and noisy.device == x.device and noisy.dtype == x.dtype
    # Network forward using inhomogeneous timesteps
    output_inhom = teacher(x, t_inhom, condition=condition)
    assert output_inhom.shape == x.shape
    # Test feature extraction path
    output_features = teacher(x, t, condition=condition, return_features_early=True, feature_indices={0})
    assert isinstance(output_features, list), "Feature extraction should return a list"
    # Verify chunk_size property
    assert hasattr(teacher, "chunk_size"), "CausalWan should have chunk_size attribute"
    assert teacher.chunk_size == 3, f"Expected chunk_size=3, got {teacher.chunk_size}"
    # Edge case: single frame
    single_frame_latents = torch.randn(batch_size, C, 1, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        single_frame_output = teacher.sample(
            noise=single_frame_latents,
            condition=condition,
            neg_condition=neg_condition,
        )
    assert single_frame_output.shape == single_frame_latents.shape
    # Frames with remainder when divided by chunk_size (5 frames, 3-per-chunk => remainder 2)
    odd_frames_latents = torch.randn(batch_size, C, 5, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        odd_output = teacher.sample(
            noise=odd_frames_latents,
            condition=condition,
            neg_condition=neg_condition,
        )
    assert odd_output.shape == odd_frames_latents.shape
    # Clear memory after memory-intensive test
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_wan22_5b_i2v():
    """
    Test the Wan 2.2 I2V 5B network: scheduler properties, autocast and plain
    forward passes, early feature extraction, and the RF schedule override.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = Wan22_I2V_5B_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Wan config doesn't support model_channels/num_blocks override, use default
    teacher = instantiate(teacher_config)
    # only use a single block for testing
    teacher.transformer.blocks = teacher.transformer.blocks[:1]
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    num_frames, height, width = 5, 32, 32
    # Latent sizes: temporal stride 4 (ceil), spatial stride 16 for this variant
    (
        T,
        H,
        W,
    ) = (num_frames + 3) // 4, height // 16, width // 16
    x = torch.randn(batch_size, 48, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # Use scheduler to sample valid time steps
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    # compute text encoder hidden states
    text_embeds = teacher.text_encoder.encode(["a caption"])
    # compute input for I2V models (first frame only, VAE-encoded)
    image = torch.zeros(batch_size, 3, height, width, device=device, dtype=dtype)  # [B, C, H, W]
    image = image.unsqueeze(2)  # add a time axis: [B, C, 1, H, W]
    first_frame_cond = image
    first_frame_cond = first_frame_cond.to(device=device, dtype=dtype)
    first_frame_cond = teacher.vae.encode(first_frame_cond)
    # Handle case where text encoder returns tuple or needs device placement
    if isinstance(text_embeds, tuple):
        text_embeds = text_embeds[0]  # Take the first element if it's a tuple
    text_embeds = text_embeds.to(device=device, dtype=dtype)
    condition = dict(
        text_embeds=text_embeds,
        first_frame_cond=first_frame_cond,
    )
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward pass without autocast
    output = teacher(x, t, condition=condition)
    # Early feature extraction: returns intermediate feature maps instead of the prediction
    output = teacher(
        x,
        t,
        condition=condition,
        return_features_early=True,
        feature_indices={0},
    )
    assert isinstance(output, list)  # confirm output is a list
    for feature in output:
        # Use original model channels (not overridden)
        expected_channels = 768  # Original Wan channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # test rf schedule
    # NOTE(review): this mutates the imported module-level config object, so the change
    # persists for any later test that reuses Wan22_I2V_5B_Config — confirm intended.
    teacher_config.schedule_type = "rf"
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    teacher = teacher.to(device=device, dtype=dtype)
    # Test RF schedule properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    # Explicitly delete the 5B model to free GPU memory immediately
    del teacher
    # Clear memory after memory-intensive test
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_wan21_14b_i2v():
    """
    Test the Wan 2.1 I2V 14B 480P network: scheduler properties, autocast and
    plain forward passes, early feature extraction, and the RF schedule override.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = Wan21_I2V_14B_480P_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Check available GPU memory before attempting to load 14B model
    if torch.cuda.is_available():
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
        if total_memory < 79:  # 14B model needs ~80GB
            pytest.skip(f"Test skipped: 14B model requires ~80GB GPU memory, but only {total_memory:.1f}GB available")
    # Wan config doesn't support model_channels/num_blocks override, use default
    teacher = instantiate(teacher_config)
    # only use a single block for testing
    teacher.transformer.blocks = teacher.transformer.blocks[:1]
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    teacher.image_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    num_frames, height, width = 5, 32, 32
    # Latent sizes: temporal stride 4 (ceil), spatial stride 8 for Wan 2.1
    T, H, W = (num_frames + 3) // 4, height // 8, width // 8
    x = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # Use scheduler to sample valid time steps
    t = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    # compute text encoder hidden states
    text_embeds = teacher.text_encoder.encode(["a caption"])
    # compute image encoder hidden states
    image = torch.zeros(batch_size, 3, height, width, device=device, dtype=dtype)  # [B, C, H, W]
    encoder_hidden_states_image = teacher.image_encoder.encode(image)
    # compute input for I2V models
    image = image.unsqueeze(2)
    # wan 2.1 pads the zero tensor after the first frame
    first_frame_cond = torch.cat(
        [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
    )
    first_frame_cond = first_frame_cond.to(device=device, dtype=dtype)
    first_frame_cond = teacher.vae.encode(first_frame_cond)
    # Handle case where text encoder returns tuple or needs device placement
    if isinstance(text_embeds, tuple):
        text_embeds = text_embeds[0]  # Take the first element if it's a tuple
    text_embeds = text_embeds.to(device=device, dtype=dtype)
    condition = dict(
        text_embeds=text_embeds,
        first_frame_cond=first_frame_cond,
        encoder_hidden_states_image=encoder_hidden_states_image,
    )
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition=condition)
    assert output.shape == x.shape  # confirm output shape is the same as input shape
    assert output.device == x.device  # confirm output device is the same as input device
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward pass without autocast (keyword arg for consistency with the other calls)
    output = teacher(x, t, condition=condition)
    # Early feature extraction: returns intermediate feature maps instead of the prediction
    output = teacher(
        x,
        t,
        condition=condition,
        return_features_early=True,
        feature_indices={0},
    )
    assert isinstance(output, list)  # confirm output is a list
    for feature in output:
        # Use original model channels (not overridden)
        expected_channels = 1280  # Original Wan channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # test rf schedule
    # NOTE(review): this mutates the imported module-level config object, so the change
    # persists for any later test that reuses Wan21_I2V_14B_480P_Config — confirm intended.
    teacher_config.schedule_type = "rf"
    teacher = instantiate(teacher_config)
    teacher.init_preprocessors()
    teacher = teacher.to(device=device, dtype=dtype)
    # Test RF schedule properties comprehensively
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    # Explicitly delete the large 14B model to free GPU memory immediately
    del teacher
    # Clear memory after memory-intensive test
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_causal_wan22_5b_i2v():
    """
    Test CausalWanI2V network with Wan 2.2 TI2V 5B model.
    Tests forward pass, sample, and causal-specific features.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = CausalWan22_I2V_5B_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    teacher = instantiate(teacher_config)
    # Only use a single block for testing
    teacher.transformer.blocks = teacher.transformer.blocks[:1]
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    num_frames, height, width = 5, 32, 32
    # Latent sizes: temporal stride 4 (ceil), spatial stride 16 for this variant
    T, H, W = (num_frames + 3) // 4, height // 16, width // 16
    x = torch.randn(batch_size, 48, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # CausalWanI2V supports 2D timesteps with shape (batch_size, num_frames)
    t_1d = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    t = t_1d.unsqueeze(1).expand(batch_size, T)  # Shape: (batch_size, T)
    # Compute text encoder hidden states
    text_embeds = teacher.text_encoder.encode(["a caption"])
    # Compute input for I2V models (Wan 2.2 uses first frame replacement, not mask concat)
    image = torch.zeros(batch_size, 3, height, width, device=device, dtype=dtype)  # [B, C, H, W]
    image = image.unsqueeze(2)  # add a time axis: [B, C, 1, H, W]
    first_frame_cond = image.to(device=device, dtype=dtype)
    first_frame_cond = teacher.vae.encode(first_frame_cond)
    # Some text encoders return (embeddings, ...) tuples; keep only the embeddings
    if isinstance(text_embeds, tuple):
        text_embeds = text_embeds[0]
    text_embeds = text_embeds.to(device=device, dtype=dtype)
    condition = dict(
        text_embeds=text_embeds,
        first_frame_cond=first_frame_cond,
    )
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition)
    assert output.shape == x.shape, f"Expected shape {x.shape}, got {output.shape}"
    assert output.device == x.device, f"Expected device {x.device}, got {output.device}"
    teacher = teacher.to(device=device, dtype=dtype)
    # Forward with store_kv=True (needed for autoregressive sampling caches)
    output_with_kv = teacher(x, t, condition, store_kv=True)
    assert output_with_kv.shape == x.shape, f"Expected shape {x.shape}, got {output_with_kv.shape}"
    # Clear caches before sample
    teacher.clear_caches()
    # Prepare negative condition for classifier-free guidance
    neg_text_embeds = teacher.text_encoder.encode([""])
    if isinstance(neg_text_embeds, tuple):
        neg_text_embeds = neg_text_embeds[0]
    neg_text_embeds = neg_text_embeds.to(device=device, dtype=dtype)
    neg_condition = dict(
        text_embeds=neg_text_embeds,
        first_frame_cond=first_frame_cond,
    )
    # Test sample method: output must preserve shape, device, and dtype of the noise
    original_noise = torch.randn(batch_size, 48, T, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        ar_output = teacher.sample(
            noise=original_noise,
            condition=condition,
            neg_condition=neg_condition,
            sample_steps=2,  # Use fewer steps for testing
        )
    assert ar_output.shape == original_noise.shape, f"Expected shape {original_noise.shape}, got {ar_output.shape}"
    assert ar_output.device == original_noise.device
    assert ar_output.dtype == original_noise.dtype
    # Test feature extraction
    output_features = teacher(x, t, condition, return_features_early=True, feature_indices={0})
    assert isinstance(output_features, list), "Feature extraction should return a list"
    for feature in output_features:
        expected_channels = 768  # Wan 2.2 5B model channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # Verify chunk_size property
    assert hasattr(teacher, "chunk_size"), "CausalWanI2V should have chunk_size attribute"
    assert teacher.chunk_size == 3, f"Expected chunk_size=3, got {teacher.chunk_size}"
    # Explicitly delete to free GPU memory
    del teacher
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_causal_wan21_14b_480p_i2v():
    """
    Test CausalWanI2V network with Wan 2.1 I2V 14B 480P model.
    Tests forward pass, sample, and causal-specific features.
    """
    # Clear memory before starting memory-intensive test
    clear_gpu_memory()
    set_env_vars()
    teacher_config = CausalWan21_I2V_14B_480P_Config
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    # Check available GPU memory before attempting to load 14B model
    if torch.cuda.is_available():
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
        if total_memory < 79:  # 14B model needs ~80GB
            pytest.skip(f"Test skipped: 14B model requires ~80GB GPU memory, but only {total_memory:.1f}GB available")
    teacher = instantiate(teacher_config)
    # Only use a single block for testing
    teacher.transformer.blocks = teacher.transformer.blocks[:1]
    teacher.init_preprocessors()
    # No dtype cast yet, so we can test autocast first
    teacher = teacher.to(device=device)
    teacher.vae.to(device=device, dtype=dtype)
    teacher.text_encoder.to(device=device, dtype=dtype)
    teacher.image_encoder.to(device=device, dtype=dtype)
    # Test noise scheduler properties
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    num_frames, height, width = 5, 32, 32
    # Latent sizes: temporal stride 4 (ceil), spatial stride 8 for Wan 2.1
    T, H, W = (num_frames + 3) // 4, height // 8, width // 8
    x = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # CausalWanI2V supports 2D timesteps with shape (batch_size, num_frames)
    t_1d = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    t = t_1d.unsqueeze(1).expand(batch_size, T)  # Shape: (batch_size, T)
    # Compute text encoder hidden states
    text_embeds = teacher.text_encoder.encode(["a caption"])
    # Compute image encoder hidden states (required for Wan 2.1 14B models)
    image = torch.zeros(batch_size, 3, height, width, device=device, dtype=dtype)  # [B, C, H, W]
    encoder_hidden_states_image = teacher.image_encoder.encode(image)
    # Compute input for I2V models (Wan 2.1 uses mask concatenation)
    image = image.unsqueeze(2)  # add a time axis: [B, C, 1, H, W]
    # Wan 2.1 pads zero tensor after the first frame
    first_frame_cond = torch.cat(
        [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
    )
    first_frame_cond = first_frame_cond.to(device=device, dtype=dtype)
    first_frame_cond = teacher.vae.encode(first_frame_cond)
    # Some text encoders return (embeddings, ...) tuples; keep only the embeddings
    if isinstance(text_embeds, tuple):
        text_embeds = text_embeds[0]
    text_embeds = text_embeds.to(device=device, dtype=dtype)
    condition = dict(
        text_embeds=text_embeds,
        first_frame_cond=first_frame_cond,
        encoder_hidden_states_image=encoder_hidden_states_image,
    )
    # Do an autocasted forward pass
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition)
    assert output.shape == x.shape, f"Expected shape {x.shape}, got {output.shape}"
    assert output.device == x.device, f"Expected device {x.device}, got {output.device}"
    teacher = teacher.to(device=device, dtype=dtype)
    # Plain forward after the dtype cast (smoke test; return value intentionally unused)
    output = teacher(x, t, condition)
    # Forward with store_kv=True (needed for autoregressive sampling caches)
    output_with_kv = teacher(x, t, condition, store_kv=True)
    assert output_with_kv.shape == x.shape, f"Expected shape {x.shape}, got {output_with_kv.shape}"
    # Clear caches before sample
    teacher.clear_caches()
    # Prepare negative condition for classifier-free guidance
    neg_text_embeds = teacher.text_encoder.encode([""])
    if isinstance(neg_text_embeds, tuple):
        neg_text_embeds = neg_text_embeds[0]
    neg_text_embeds = neg_text_embeds.to(device=device, dtype=dtype)
    neg_condition = dict(
        text_embeds=neg_text_embeds,
        first_frame_cond=first_frame_cond,
        encoder_hidden_states_image=encoder_hidden_states_image,
    )
    # Test sample method: output must preserve shape, device, and dtype of the noise
    original_noise = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        ar_output = teacher.sample(
            noise=original_noise,
            condition=condition,
            neg_condition=neg_condition,
            sample_steps=2,  # Use fewer steps for testing
        )
    assert ar_output.shape == original_noise.shape, f"Expected shape {original_noise.shape}, got {ar_output.shape}"
    assert ar_output.device == original_noise.device
    assert ar_output.dtype == original_noise.dtype
    # Test feature extraction
    output_features = teacher(x, t, condition, return_features_early=True, feature_indices={0})
    assert isinstance(output_features, list), "Feature extraction should return a list"
    for feature in output_features:
        expected_channels = 1280  # Wan 2.1 14B model channels
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    # Verify chunk_size property
    assert hasattr(teacher, "chunk_size"), "CausalWanI2V should have chunk_size attribute"
    assert teacher.chunk_size == 3, f"Expected chunk_size=3, got {teacher.chunk_size}"
    # Explicitly delete to free GPU memory
    del teacher
    # Clear memory after testing
    clear_gpu_memory()
@RunIf(min_gpus=1)
@pytest.mark.large_model
def test_network_causal_wan21_14b_720p_i2v():
    """
    Test CausalWanI2V network with Wan 2.1 I2V 14B 720P model.
    Tests forward pass, sample, and causal-specific features.
    """
    clear_gpu_memory()  # start from a clean GPU state
    set_env_vars()
    cfg = CausalWan21_I2V_14B_720P_Config
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    dtype = torch.bfloat16 if use_cuda else torch.float32
    # The 14B checkpoint only fits on ~80GB cards; bail out early otherwise.
    if use_cuda:
        total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
        if total_memory < 79:  # 14B model needs ~80GB
            pytest.skip(f"Test skipped: 14B model requires ~80GB GPU memory, but only {total_memory:.1f}GB available")
    teacher = instantiate(cfg)
    # Truncate to a single transformer block to keep the test cheap.
    teacher.transformer.blocks = teacher.transformer.blocks[:1]
    teacher.init_preprocessors()
    # Leave the core model in its original dtype so autocast can be exercised first.
    teacher = teacher.to(device=device)
    for preprocessor in (teacher.vae, teacher.text_encoder, teacher.image_encoder):
        preprocessor.to(device=device, dtype=dtype)
    validate_noise_scheduler_properties(teacher, device, expected_schedule_type="rf")
    batch_size = 1
    num_frames, height, width = 5, 32, 32
    T = (num_frames + 3) // 4
    H = height // 8
    W = width // 8
    x = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)  # [B, C, T, H, W]
    # CausalWanI2V accepts per-frame timesteps of shape (batch_size, num_frames).
    t_1d = teacher.noise_scheduler.sample_t(batch_size, time_dist_type="uniform").to(device=device, dtype=dtype)
    t = t_1d.unsqueeze(1).expand(batch_size, T)
    text_embeds = teacher.text_encoder.encode(["a caption"])
    # Image-encoder hidden states are required for the Wan 2.1 14B variants.
    image = torch.zeros(batch_size, 3, height, width, device=device, dtype=dtype)  # [B, C, H, W]
    encoder_hidden_states_image = teacher.image_encoder.encode(image)
    # Wan 2.1 conditions on the first frame followed by zero padding (mask concatenation).
    image = image.unsqueeze(2)
    padding = image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)
    first_frame_cond = torch.cat([image, padding], dim=2).to(device=device, dtype=dtype)
    first_frame_cond = teacher.vae.encode(first_frame_cond)
    if isinstance(text_embeds, tuple):
        text_embeds = text_embeds[0]
    text_embeds = text_embeds.to(device=device, dtype=dtype)
    condition = {
        "text_embeds": text_embeds,
        "first_frame_cond": first_frame_cond,
        "encoder_hidden_states_image": encoder_hidden_states_image,
    }
    # Autocasted forward pass on the not-yet-cast model.
    with torch.autocast(device_type=device.type, dtype=dtype):
        output = teacher(x, t, condition)
    assert output.shape == x.shape, f"Expected shape {x.shape}, got {output.shape}"
    assert output.device == x.device, f"Expected device {x.device}, got {output.device}"
    teacher = teacher.to(device=device, dtype=dtype)
    # Plain forward after the dtype cast.
    output = teacher(x, t, condition)
    # store_kv=True populates the autoregressive sampling caches.
    output_with_kv = teacher(x, t, condition, store_kv=True)
    assert output_with_kv.shape == x.shape, f"Expected shape {x.shape}, got {output_with_kv.shape}"
    # Drop the caches written above before sampling from scratch.
    teacher.clear_caches()
    # Negative prompt embedding for classifier-free guidance.
    neg_text_embeds = teacher.text_encoder.encode([""])
    if isinstance(neg_text_embeds, tuple):
        neg_text_embeds = neg_text_embeds[0]
    neg_text_embeds = neg_text_embeds.to(device=device, dtype=dtype)
    neg_condition = {
        "text_embeds": neg_text_embeds,
        "first_frame_cond": first_frame_cond,
        "encoder_hidden_states_image": encoder_hidden_states_image,
    }
    # Autoregressive sampling must preserve shape, device, and dtype of the noise.
    original_noise = torch.randn(batch_size, 16, T, H, W, device=device, dtype=dtype)
    with torch.no_grad():
        ar_output = teacher.sample(
            noise=original_noise,
            condition=condition,
            neg_condition=neg_condition,
            sample_steps=2,  # Use fewer steps for testing
        )
    assert ar_output.shape == original_noise.shape, f"Expected shape {original_noise.shape}, got {ar_output.shape}"
    assert ar_output.device == original_noise.device
    assert ar_output.dtype == original_noise.dtype
    # Early feature extraction should yield a list of [B, C, T, H, W] maps.
    output_features = teacher(x, t, condition, return_features_early=True, feature_indices={0})
    assert isinstance(output_features, list), "Feature extraction should return a list"
    expected_channels = 1280  # Wan 2.1 14B model channels
    for feature in output_features:
        assert feature.shape == (batch_size, expected_channels, T, H, W)
    assert hasattr(teacher, "chunk_size"), "CausalWanI2V should have chunk_size attribute"
    assert teacher.chunk_size == 3, f"Expected chunk_size=3, got {teacher.chunk_size}"
    # Free the 14B model promptly, then clean up.
    del teacher
    clear_gpu_memory()