# Extraction artifacts preserved as a comment so the file stays valid Python:
# file size 4,160 bytes, commit 75da08b, 165 lines.
import modal

# Container image for the benchmark: CUDA 12.8 devel base with Python 3.11,
# the lip-sync ML/audio/video dependency stack, and the local latentsync
# sources mounted into the container.
lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8",
        ]
    )
    .apt_install([
        "libgl1",
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    # Accelerated Hugging Face Hub downloads.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Remove the NVIDIA base container entrypoint so Modal controls startup.
    .entrypoint([])
    # BUG FIX: was `.add.add_local_dir(...)` — `.add` is not an attribute of
    # modal.Image, so the original chain raised AttributeError at import time.
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)
# Create the Modal app
app = modal.App("matrix-benchmark")
# Import torch/time only inside the container: they are installed in
# lipsync_image, not necessarily in the local environment driving Modal.
with lipsync_image.imports():
    import torch
    import time
# Function to test matrix multiplication speed
def benchmark_matrix_multiplication(size=1024, iterations=10, device='cuda'):
    """Benchmark square-matrix multiplication and return mean seconds per matmul.

    Args:
        size: Side length of the square matrices.
        iterations: Number of timed multiplications to average over.
        device: Torch device string; defaults to 'cuda' (original behavior).

    Returns:
        Average wall-clock seconds per multiplication.
    """
    # Create matrices on the target device
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)
    # Warm up (first calls pay kernel-selection / caching costs)
    for _ in range(3):
        _ = torch.matmul(a, b)
    # BUG FIX: synchronize *before* starting the clock — CUDA launches are
    # asynchronous, so queued warm-up kernels were billed to the timed region.
    if device == 'cuda':
        torch.cuda.synchronize()
    start_time = time.time()
    for _ in range(iterations):
        torch.matmul(a, b)
    if device == 'cuda':
        torch.cuda.synchronize()  # Ensure all timed kernels complete
    end_time = time.time()
    return (end_time - start_time) / iterations
# Modal function for matrix multiplication benchmark
@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300
)
def benchmark_cpu(size=1024):
    """Run the matmul benchmark and report timing plus device name.

    NOTE(review): despite the name, this function is pinned to an A100 (see
    gpu= above) and calls torch.cuda APIs, so it is a GPU benchmark. The
    "gpu": None field in the result looks like a leftover from an earlier
    CPU variant — confirm intent before relying on it downstream.
    """
    return {
        "gpu": None,
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }
# A100 variant of the matmul benchmark.
@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300
)
def benchmark_a100(size=1024):
    """Run the matmul benchmark on an A100 and report timing and device name."""
    # Evaluation order matches the original dict literal: time first,
    # then query the device name.
    avg_seconds = benchmark_matrix_multiplication(size, 10)
    device_name = torch.cuda.get_device_name(0)
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": avg_seconds,
        "device": device_name,
    }
@app.function(
    image=lipsync_image,
    gpu="L40S",  # L40S GPU
    timeout=300,
)
def benchmark_l40s(size=1024):
    """Benchmark on L40S GPU"""
    # FIX: the docstring previously followed a local `import torch`, making it
    # a no-op string expression rather than a docstring. The local import was
    # also redundant — torch is in scope via the `lipsync_image.imports()`
    # block, the same way the other benchmark functions get it.
    return {
        "gpu": "L40S",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }
@app.function(
    image=lipsync_image,
    gpu="H100",  # H100 GPU
    timeout=300,
)
def benchmark_h100(size=1024):
    """Benchmark on H100 GPU"""
    # FIX: the docstring previously followed a local `import torch`, making it
    # a no-op string expression rather than a docstring. The local import was
    # also redundant — torch is in scope via the `lipsync_image.imports()`
    # block, the same way the other benchmark functions get it.
    return {
        "gpu": "H100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }
# Main function to run all benchmarks
@app.local_entrypoint()
def main():
    """Run the matrix-multiplication benchmarks from the local machine."""
    print("Starting matrix multiplication benchmarks...")
    print("\nRunning benchmark across different GPUs...")
    # Only the A100-backed function is exercised for now. benchmark_a100,
    # benchmark_l40s and benchmark_h100 can be fanned out the same way
    # (e.g. `for fn in (benchmark_a100, ...): print(fn.remote(1024))`).
    # Removed: a dead triple-quoted string that held commented-out code.
    print(benchmark_cpu.remote(1024))
|