#!/usr/bin/env python3
"""
End-to-end video generation benchmark for LTX-Video.
Compares performance between:
- Baseline (no optimizations)
- Custom CUDA kernels (optimized RMSNorm)
- torch.compile optimization
Usage:
python benchmark_example.py --use-optimized-kernels
python benchmark_example.py --no-optimized-kernels --compile
python benchmark_example.py --use-optimized-kernels --compile # Rejected: these flags are mutually exclusive (exits with status 1)
"""
import argparse
import sys
import time
import torch
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the benchmark's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which is
            what the original zero-argument call did.

    Returns:
        A namespace with the attributes ``use_optimized_kernels``,
        ``no_optimized_kernels``, ``compile``, ``batch_size``,
        ``num_frames``, ``height``, ``width``, ``steps`` and
        ``warmup_iterations``.

    Raises:
        SystemExit: Raised by argparse for ``--help``, for a value that is
            not an integer where one is required, or when both
            ``--use-optimized-kernels`` and ``--no-optimized-kernels`` are
            supplied.
    """
    parser = argparse.ArgumentParser(description="LTX-Video Benchmark")

    # The two kernel switches contradict each other; putting them in a
    # mutually exclusive group makes argparse reject the pair instead of
    # silently accepting both.
    kernel_choice = parser.add_mutually_exclusive_group()
    kernel_choice.add_argument(
        "--use-optimized-kernels",
        action="store_true",
        help="Use custom H100 CUDA kernels",
    )
    kernel_choice.add_argument(
        "--no-optimized-kernels",
        action="store_true",
        help="Use baseline implementation",
    )

    parser.add_argument(
        "--compile",
        action="store_true",
        help="Enable torch.compile on transformer",
    )

    # Integer-valued generation settings: (flag, default, help text).
    int_options = (
        ("--batch-size", 1, "Number of videos per prompt"),
        ("--num-frames", 161, "Number of frames to generate"),
        ("--height", 512, "Video height in pixels"),
        ("--width", 768, "Video width in pixels"),
        ("--steps", 50, "Denoising steps"),
        ("--warmup-iterations", 2, "Warmup runs before benchmark"),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    return parser.parse_args(argv)
def main():
    """Validate the flags, report the configuration, and probe the CUDA device.

    Parses the command line, refuses the combination of custom kernels with
    torch.compile, prints the selected settings followed by the CUDA device
    name and compute capability, and finishes with installation
    instructions. This function does not generate any video itself.

    Raises:
        SystemExit: With status 1 when ``--use-optimized-kernels`` is
            combined with ``--compile``, or when CUDA is not available.
    """
    args = parse_args()

    if args.use_optimized_kernels and args.compile:
        # Fatal diagnostics go to stderr so they remain visible when stdout
        # is redirected or piped.
        print(
            "WARNING: --use-optimized-kernels and --compile are mutually exclusive.",
            file=sys.stderr,
        )
        print(
            "Custom kernels require PyTorch custom op registration to work with torch.compile.",
            file=sys.stderr,
        )
        sys.exit(1)

    rule = "=" * 60
    print(rule)
    print("LTX-Video Benchmark")
    print(rule)

    print("Configuration:")
    print(f" Optimized kernels: {args.use_optimized_kernels}")
    print(f" torch.compile: {args.compile}")
    print(f" Batch size: {args.batch_size}")
    print(f" Frames: {args.num_frames}")
    print(f" Resolution: {args.width}x{args.height}")
    print(f" Steps: {args.steps}")
    print(f" Warmup iterations: {args.warmup_iterations}")

    if not torch.cuda.is_available():
        print(
            "\nCUDA not available. This benchmark requires a GPU.",
            file=sys.stderr,
        )
        sys.exit(1)

    print(f"\nDevice: {torch.cuda.get_device_name()}")
    print(f"Capability: {torch.cuda.get_device_capability()}")

    # NOTE: Full implementation requires diffusers and ltx_kernels
    print("\nThis is a reference benchmark script.")
    print("To run the full benchmark, install diffusers and build the CUDA kernels:")
    print(" pip install diffusers transformers accelerate")
    print(" cd examples/ltx_video && uv pip install -e .")
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|