import modal

lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8",
        ]
    )
    .apt_install([
        "libgl1",
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .entrypoint([])  # remove NVIDIA base container entrypoint
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)

# Create the Modal app
app = modal.App("matrix-benchmark")

with lipsync_image.imports():
    import torch
    import time


# Function to test matrix multiplication speed
def benchmark_matrix_multiplication(size=1024, iterations=10):
    """Benchmark matrix multiplication performance."""
    # Create matrices on the GPU
    a = torch.randn(size, size, device="cuda")
    b = torch.randn(size, size, device="cuda")

    # Warm up the GPU and wait for the warm-up kernels to finish,
    # so they don't leak into the timed region
    for _ in range(3):
        _ = torch.matmul(a, b)
    torch.cuda.synchronize()

    # Time the operation
    start_time = time.time()
    for _ in range(iterations):
        result = torch.matmul(a, b)
    torch.cuda.synchronize()  # Ensure all queued kernels complete before stopping the clock
    end_time = time.time()

    avg_time = (end_time - start_time) / iterations
    return avg_time


# Modal function for the matrix multiplication benchmark on the default GPU
@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300,
)
def benchmark_cpu(size=1024):
    """Smoke-test benchmark on the default GPU (A100)."""
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="A100",
    timeout=300,
)
def benchmark_a100(size=1024):
    """Benchmark on an A100 GPU."""
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="L40S",
    timeout=300,
)
def benchmark_l40s(size=1024):
    """Benchmark on an L40S GPU."""
    return {
        "gpu": "L40S",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


@app.function(
    image=lipsync_image,
    gpu="H100",
    timeout=300,
)
def benchmark_h100(size=1024):
    """Benchmark on an H100 GPU."""
    return {
        "gpu": "H100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0),
    }


# Main function to run all benchmarks
@app.local_entrypoint()
def main():
    """Run the benchmarks."""
    print("Starting matrix multiplication benchmarks...")

    # Run the default benchmark
    print("\nRunning benchmark across different GPUs...")
    print(benchmark_cpu.remote(1024))

    # Run benchmarks on each GPU type (currently disabled)
    """
    futures = [
        benchmark_a100,
        benchmark_l40s,
        benchmark_h100,
    ]

    # Wait for results
    results = []
    for future in futures:
        print(future.remote(1024))
    """
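

# A minimal sketch (an addition, not part of the original script) of how the
# disabled per-GPU loop in main() could run concurrently using Modal's
# spawn()/get() API instead of sequential .remote() calls. The helper name
# run_all_gpus is hypothetical; it assumes the three benchmark_* functions
# defined above.
def run_all_gpus(size: int = 1024):
    # spawn() launches each remote call without blocking and returns a FunctionCall
    calls = [fn.spawn(size) for fn in (benchmark_a100, benchmark_l40s, benchmark_h100)]
    # get() blocks until each remote call finishes and returns its result dict
    return [call.get() for call in calls]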