import modal

lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8",
        ]
    )
    .apt_install([
        "libgl1",
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .entrypoint([])  # remove NVIDIA base container entrypoint
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)

# Create the Modal app
app = modal.App("matrix-benchmark")

with lipsync_image.imports():
    import torch
    import time


# Function to test matrix multiplication speed
def benchmark_matrix_multiplication(size=1024, iterations=10):
    """Benchmark matrix multiplication performance."""
    # Create matrices on the GPU
    a = torch.randn(size, size, device="cuda")
    b = torch.randn(size, size, device="cuda")

    # Warm up the GPU and wait for the warm-up kernels to finish,
    # so they don't leak into the timed region
    for _ in range(3):
        _ = torch.matmul(a, b)
    torch.cuda.synchronize()

    # Time the operation
    start_time = time.time()
    for _ in range(iterations):
        result = torch.matmul(a, b)
    torch.cuda.synchronize()  # Ensure all queued kernels complete before stopping the clock
    end_time = time.time()

    avg_time = (end_time - start_time) / iterations
    return avg_time


# Modal function for the matrix multiplication benchmark on the default GPU
@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300,
)
def benchmark_cpu(size=1024):
    """Smoke-test benchmark on the default GPU (A100)."""
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="A100",
    timeout=300,
)
def benchmark_a100(size=1024):
    """Benchmark on an A100 GPU."""
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="L40S",
    timeout=300,
)
def benchmark_l40s(size=1024):
    """Benchmark on an L40S GPU."""
    return {
        "gpu": "L40S",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="H100",
    timeout=300,
)
def benchmark_h100(size=1024):
    """Benchmark on an H100 GPU."""
    return {
        "gpu": "H100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


# Main function to run all benchmarks
@app.local_entrypoint()
def main():
    """Run the benchmarks."""
    print("Starting matrix multiplication benchmarks...")

    # Run the default benchmark
    print("\nRunning benchmark across different GPUs...")
    print(benchmark_cpu.remote(1024))

    # Run benchmarks on each GPU type (currently disabled)
    """
    futures = [
        benchmark_a100,
        benchmark_l40s,
        benchmark_h100,
    ]

    # Wait for results
    results = []
    for future in futures:
        print(future.remote(1024))
    """
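

# A minimal sketch (an addition, not part of the original script) of how the
# disabled per-GPU loop in main() could run concurrently using Modal's
# spawn()/get() API instead of sequential .remote() calls. The helper name
# run_all_gpus is hypothetical; it assumes the three benchmark_* functions
# defined above.
def run_all_gpus(size: int = 1024):
    # spawn() launches each remote call without blocking and returns a FunctionCall
    calls = [fn.spawn(size) for fn in (benchmark_a100, benchmark_l40s, benchmark_h100)]
    # get() blocks until each remote call finishes and returns its result dict
    return [call.get() for call in calls]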