# evaluate.py — model evaluation and benchmarking utilities for TTV-1B
"""
Model evaluation and testing utilities for TTV-1B
"""
import torch
import torch.nn as nn
from video_ttv_1b import VideoTTV1B, create_model
import time
from typing import Dict, Tuple
import numpy as np
def count_parameters(model: nn.Module) -> Dict[str, int]:
    """Return a per-component parameter-count breakdown for the model.

    The text encoder, patch embedding, and DiT blocks are counted
    individually; all remaining parameters fall under 'other', and
    'total' is the sum of everything.
    """
    def _numel(module: nn.Module) -> int:
        # Total element count across all parameters of one submodule.
        return sum(p.numel() for p in module.parameters())

    breakdown = {
        'text_encoder': _numel(model.text_encoder),
        'patch_embed': _numel(model.patch_embed),
        'dit_blocks': _numel(model.blocks),
    }
    named_total = sum(breakdown.values())
    # Everything not captured by the named components above.
    breakdown['other'] = _numel(model) - named_total
    breakdown['total'] = named_total + breakdown['other']
    return breakdown
def measure_inference_speed(
    model: nn.Module,
    batch_size: int = 1,
    num_iterations: int = 10,
    device: str = 'cuda',
    warmup_iterations: int = 3,
) -> Dict[str, float]:
    """Measure average forward-pass latency and throughput.

    Args:
        model: TTV model called as ``model(videos, timesteps, text_tokens)``.
        batch_size: Samples per forward pass.
        num_iterations: Number of timed iterations (after warmup).
        device: Device string, e.g. 'cuda', 'cuda:1', or 'cpu'.
        warmup_iterations: Untimed iterations run first so lazy CUDA
            initialization / autotuning does not skew the measurement.

    Returns:
        Dict with 'total_time', 'avg_time_per_batch', 'throughput'
        (samples/sec), and 'time_per_sample' (seconds).
    """
    model.eval()
    # FIX: the old `device == 'cuda'` string comparison missed device
    # strings such as 'cuda:0' and silently skipped synchronization,
    # producing timings that only measured kernel launches. Normalize
    # through torch.device instead.
    is_cuda = torch.device(device).type == 'cuda'

    # Dummy inputs matching the model's expected shapes (16 frames of
    # 256x256 RGB; 50257-token vocab with 256 text tokens — shapes
    # mirrored from the correctness tests in this file).
    # Allocate directly on the target device instead of CPU + .to().
    videos = torch.randn(batch_size, 3, 16, 256, 256, device=device)
    timesteps = torch.randint(0, 1000, (batch_size,), device=device)
    text_tokens = torch.randint(0, 50257, (batch_size, 256), device=device)

    # Warmup (untimed).
    with torch.no_grad():
        for _ in range(warmup_iterations):
            _ = model(videos, timesteps, text_tokens)

    # Timed section. perf_counter is monotonic and higher-resolution
    # than time.time(), so short runs are measured more reliably.
    if is_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        for _ in range(num_iterations):
            _ = model(videos, timesteps, text_tokens)
    if is_cuda:
        torch.cuda.synchronize()
    total_time = time.perf_counter() - start_time

    avg_time = total_time / num_iterations
    return {
        'total_time': total_time,
        'avg_time_per_batch': avg_time,
        'throughput': batch_size / avg_time,
        'time_per_sample': avg_time / batch_size,
    }
def measure_memory_usage(
    model: nn.Module,
    batch_size: int = 1,
    device: str = 'cuda',
) -> Dict[str, float]:
    """Measure model-weight and peak forward-pass memory on CUDA.

    Args:
        model: TTV model called as ``model(videos, timesteps, text_tokens)``;
            assumed to already live on ``device``.
        batch_size: Samples in the probe forward pass.
        device: Device string; anything whose type is not 'cuda' returns
            an ``{'error': ...}`` dict (kept for backward compatibility).

    Returns:
        Dict with 'model_memory_mb', 'peak_memory_mb', and
        'activation_memory_mb' (peak minus weights).
    """
    # FIX: accept any CUDA device string ('cuda', 'cuda:1', ...) — the
    # old `device != 'cuda'` literal comparison rejected 'cuda:0' etc.
    if torch.device(device).type != 'cuda':
        return {'error': 'Memory measurement only available on CUDA'}

    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

    # Static weight footprint, computed from parameter sizes.
    model_memory = sum(p.numel() * p.element_size() for p in model.parameters())
    model_memory_mb = model_memory / (1024 ** 2)

    # Probe inputs (same shapes as the speed benchmark), allocated
    # directly on the device.
    videos = torch.randn(batch_size, 3, 16, 256, 256, device=device)
    timesteps = torch.randint(0, 1000, (batch_size,), device=device)
    text_tokens = torch.randint(0, 50257, (batch_size, 256), device=device)

    # Reset again so the peak reflects just this forward pass.
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        _ = model(videos, timesteps, text_tokens)

    peak_memory_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)
    return {
        'model_memory_mb': model_memory_mb,
        'peak_memory_mb': peak_memory_mb,
        # NOTE(review): approximation — also includes the probe inputs,
        # not just activations.
        'activation_memory_mb': peak_memory_mb - model_memory_mb,
    }
def test_model_correctness(model: nn.Module, device: str = 'cuda') -> bool:
"""Test model correctness with various inputs"""
model.eval()
tests_passed = 0
total_tests = 0
# Test 1: Output shape
total_tests += 1
x = torch.randn(2, 3, 16, 256, 256).to(device)
t = torch.randint(0, 1000, (2,)).to(device)
tokens = torch.randint(0, 50257, (2, 256)).to(device)
with torch.no_grad():
output = model(x, t, tokens)
if output.shape == x.shape:
tests_passed += 1
print("βœ“ Test 1 passed: Output shape matches input")
else:
print(f"βœ— Test 1 failed: Expected {x.shape}, got {output.shape}")
# Test 2: No NaN values
total_tests += 1
if not torch.isnan(output).any():
tests_passed += 1
print("βœ“ Test 2 passed: No NaN values in output")
else:
print("βœ— Test 2 failed: NaN values detected in output")
# Test 3: Different timesteps produce different outputs
total_tests += 1
t1 = torch.full((2,), 0).to(device)
t2 = torch.full((2,), 999).to(device)
with torch.no_grad():
out1 = model(x, t1, tokens)
out2 = model(x, t2, tokens)
if not torch.allclose(out1, out2, rtol=1e-3):
tests_passed += 1
print("βœ“ Test 3 passed: Different timesteps produce different outputs")
else:
print("βœ— Test 3 failed: Outputs identical for different timesteps")
# Test 4: Different text produces different outputs
total_tests += 1
tokens1 = torch.randint(0, 50257, (2, 256)).to(device)
tokens2 = torch.randint(0, 50257, (2, 256)).to(device)
with torch.no_grad():
out1 = model(x, t, tokens1)
out2 = model(x, t, tokens2)
if not torch.allclose(out1, out2, rtol=1e-3):
tests_passed += 1
print("βœ“ Test 4 passed: Different text produces different outputs")
else:
print("βœ— Test 4 failed: Outputs identical for different text")
# Test 5: Gradient flow (training mode)
total_tests += 1
model.train()
x.requires_grad = True
output = model(x, t, tokens)
loss = output.mean()
loss.backward()
if x.grad is not None and not torch.isnan(x.grad).any():
tests_passed += 1
print("βœ“ Test 5 passed: Gradients computed correctly")
else:
print("βœ— Test 5 failed: Gradient computation error")
model.eval()
print(f"\nTests passed: {tests_passed}/{total_tests}")
return tests_passed == total_tests
def benchmark_full_pipeline(device: str = 'cuda'):
    """Run the full benchmark suite: params, memory, speed, correctness."""
    separator = "=" * 60
    print(separator)
    print("TTV-1B Model Benchmark")
    print(separator)

    # Build the model first; everything below operates on it.
    print("\n1. Creating model...")
    model = create_model(device)
    print(f" Device: {device}")

    print("\n2. Parameter count:")
    for component, n_params in count_parameters(model).items():
        print(f" {component:20s}: {n_params:>12,} ({n_params/1e6:>6.1f}M)")

    # Memory stats are only meaningful on CUDA.
    if device == 'cuda':
        print("\n3. Memory usage:")
        for stat_name, stat_value in measure_memory_usage(model, batch_size=1, device=device).items():
            print(f" {stat_name:25s}: {stat_value:>8.1f} MB")

    print("\n4. Inference speed:")
    speed = measure_inference_speed(model, batch_size=1, num_iterations=10, device=device)
    print(f" Average time per batch: {speed['avg_time_per_batch']:.3f} seconds")
    print(f" Time per sample: {speed['time_per_sample']:.3f} seconds")
    print(f" Throughput: {speed['throughput']:.2f} samples/sec")

    print("\n5. Correctness tests:")
    all_passed = test_model_correctness(model, device)

    print("\n" + separator)
    print("βœ“ All tests passed!" if all_passed else "βœ— Some tests failed")
    print(separator)
def estimate_training_time(
    num_samples: int = 1_000_000,
    batch_size: int = 16,
    num_epochs: int = 100,
    seconds_per_batch: float = 2.0,
) -> Dict[str, float]:
    """Back-of-the-envelope training-time estimate from batch throughput.

    Args:
        num_samples: Dataset size (samples per epoch).
        batch_size: Samples per optimization step.
        num_epochs: Number of passes over the dataset.
        seconds_per_batch: Measured wall time per step.

    Returns:
        Dict with 'steps_per_epoch', 'total_steps', 'total_hours',
        and 'total_days'.
    """
    per_epoch = num_samples // batch_size
    overall_steps = per_epoch * num_epochs
    wall_seconds = overall_steps * seconds_per_batch
    return {
        'steps_per_epoch': per_epoch,
        'total_steps': overall_steps,
        'total_hours': wall_seconds / 3600,
        'total_days': wall_seconds / (3600 * 24),
    }
if __name__ == "__main__":
    # Run the full benchmark on the best available device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    benchmark_full_pipeline(device)

    # Rough wall-clock projections for two hardware setups.
    print("\n" + "=" * 60)
    print("Training Time Estimates")
    print("=" * 60)

    hardware_configs = [
        {'name': 'Single A100 (bs=2, grad_accum=8)', 'batch_size': 16, 'seconds_per_batch': 3.0},
        {'name': '8x A100 (bs=16, grad_accum=8)', 'batch_size': 128, 'seconds_per_batch': 3.0},
    ]
    for cfg in hardware_configs:
        print(f"\n{cfg['name']}:")
        est = estimate_training_time(
            num_samples=10_000_000,
            batch_size=cfg['batch_size'],
            num_epochs=10,
            seconds_per_batch=cfg['seconds_per_batch'],
        )
        print(f" Steps per epoch: {est['steps_per_epoch']:,}")
        print(f" Total steps: {est['total_steps']:,}")
        print(f" Estimated time: {est['total_days']:.1f} days ({est['total_hours']:.1f} hours)")