#!/usr/bin/env python3
"""End-to-end video generation benchmark for LTX-Video.

Compares performance between:
- Baseline (no optimizations)
- Custom CUDA kernels (optimized RMSNorm)
- torch.compile optimization

Usage:
    python benchmark_example.py --use-optimized-kernels
    python benchmark_example.py --no-optimized-kernels --compile
    python benchmark_example.py --use-optimized-kernels --compile  # Mutually exclusive!
"""
import argparse
import sys
import time

import torch


def parse_args() -> argparse.Namespace:
    """Parse command-line options for the benchmark.

    Returns:
        argparse.Namespace carrying the flags documented in each
        ``add_argument`` call below.
    """
    parser = argparse.ArgumentParser(description="LTX-Video Benchmark")
    # The two kernel-selection flags are contradictory, so let argparse
    # reject passing both instead of silently accepting an ambiguous
    # configuration (previously both could be set at once).
    kernel_group = parser.add_mutually_exclusive_group()
    kernel_group.add_argument("--use-optimized-kernels", action="store_true",
                              help="Use custom H100 CUDA kernels")
    kernel_group.add_argument("--no-optimized-kernels", action="store_true",
                              help="Use baseline implementation")
    parser.add_argument("--compile", action="store_true",
                        help="Enable torch.compile on transformer")
    parser.add_argument("--batch-size", type=int, default=1,
                        help="Number of videos per prompt")
    parser.add_argument("--num-frames", type=int, default=161,
                        help="Number of frames to generate")
    parser.add_argument("--height", type=int, default=512,
                        help="Video height in pixels")
    parser.add_argument("--width", type=int, default=768,
                        help="Video width in pixels")
    parser.add_argument("--steps", type=int, default=50,
                        help="Denoising steps")
    parser.add_argument("--warmup-iterations", type=int, default=2,
                        help="Warmup runs before benchmark")
    return parser.parse_args()


def main() -> None:
    """Validate the configuration, print it, and point at the full benchmark setup.

    Exits with status 1 when the flag combination is unsupported or when no
    CUDA device is available.
    """
    args = parse_args()

    # Custom kernels are stated to require PyTorch custom-op registration to
    # interoperate with torch.compile, so this combination is refused up front.
    if args.use_optimized_kernels and args.compile:
        print("WARNING: --use-optimized-kernels and --compile are mutually exclusive.")
        print("Custom kernels require PyTorch custom op registration to work with torch.compile.")
        sys.exit(1)

    print("=" * 60)
    print("LTX-Video Benchmark")
    print("=" * 60)
    print("Configuration:")
    print(f"  Optimized kernels: {args.use_optimized_kernels}")
    print(f"  torch.compile: {args.compile}")
    print(f"  Batch size: {args.batch_size}")
    print(f"  Frames: {args.num_frames}")
    print(f"  Resolution: {args.width}x{args.height}")
    print(f"  Steps: {args.steps}")
    print(f"  Warmup iterations: {args.warmup_iterations}")

    if not torch.cuda.is_available():
        print("\nCUDA not available. This benchmark requires a GPU.")
        sys.exit(1)

    print(f"\nDevice: {torch.cuda.get_device_name()}")
    print(f"Capability: {torch.cuda.get_device_capability()}")

    # NOTE: Full implementation requires diffusers and ltx_kernels
    print("\nThis is a reference benchmark script.")
    print("To run the full benchmark, install diffusers and build the CUDA kernels:")
    print("  pip install diffusers transformers accelerate")
    print("  cd examples/ltx_video && uv pip install -e .")


if __name__ == "__main__":
    main()