File size: 4,160 Bytes
75da08b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import modal
# Container image for the benchmark: CUDA 12.8 devel base with Python 3.11,
# the ML/AV dependency stack, and the local latentsync source tree mounted in.
lipsync_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.11")
    .uv_pip_install(
        [
            "torch",
            "torchvision",
            "xformers",
            "triton",
            "diffusers",
            "transformers",
            "huggingface-hub",
            "imageio==2.27.0",
            "decord==0.6.0",
            "accelerate",
            "einops==0.7.0",
            "omegaconf==2.3.0",
            "safetensors>=0.4.3",
            "opencv-python==4.9.0.80",
            "mediapipe==0.10.11",
            "av==11.0.0",
            "torch-fidelity==0.3.0",
            "torchmetrics==1.3.1",
            "python_speech_features==0.6",
            "librosa==0.10.1",
            "scenedetect==0.6.1",
            "ffmpeg-python==0.2.0",
            "lpips==0.1.4",
            "face-alignment==1.4.1",
            "ninja==1.11.1.1",
            "pandas==2.0.3",
            "numpy<2",
            "pydub==0.25.1",
            "moviepy==1.0.3",
            "hf-xet==1.1.8"
        ]
    )
    .apt_install([
        "libgl1",
        "curl",
        "git",
        "wget",
        "ffmpeg",
    ])
    .env(
        {"HF_HUB_ENABLE_HF_TRANSFER": "1"}  # accelerated HF Hub downloads
    )
    .entrypoint([])  # remove NVIDIA base container entrypoint
    # BUG FIX: was `.add.add_local_dir(...)` — Image has no `.add` attribute,
    # which raised AttributeError the moment this module was imported.
    .add_local_dir(
        "/home/misha/OpenLipSync/latentsync",
        remote_path="/latentsync",
    )
)

# Create the Modal app
app = modal.App("matrix-benchmark")
# NOTE(review): `imports()` appears intended to make torch/time importable
# only inside the container image; presumably Modal suppresses the import
# locally when the host lacks torch — confirm against Modal docs.
with lipsync_image.imports():
    import torch
    import time

# Function to test matrix multiplication speed
def benchmark_matrix_multiplication(size=1024, iterations=10, device='cuda'):
    """Benchmark square matrix multiplication performance.

    Args:
        size: Edge length of the square matrices.
        iterations: Number of timed matmul runs to average over.
        device: Torch device string. Defaults to 'cuda' (original behavior);
            'cpu' is also supported, which makes the function testable
            without a GPU.

    Returns:
        Average wall-clock seconds per matmul.
    """
    # Create matrices on the requested device
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    is_cuda = torch.device(device).type == 'cuda'

    # Warm up (first launches pay kernel-compilation / cache-fill costs)
    for _ in range(3):
        _ = torch.matmul(a, b)
    # BUG FIX: synchronize after warm-up so still-pending warm-up kernels
    # are not billed to the timed section below.
    if is_cuda:
        torch.cuda.synchronize()

    # Time the operation
    start_time = time.time()
    for _ in range(iterations):
        _ = torch.matmul(a, b)
        if is_cuda:
            torch.cuda.synchronize()  # ensure the async kernel completes
    end_time = time.time()

    return (end_time - start_time) / iterations

# Modal function for matrix multiplication benchmark
# NOTE(review): despite its name this function requests an A100 GPU; the name
# is kept unchanged so existing callers (e.g. main) keep working.
@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300
)
def benchmark_cpu(size=1024):
    """Benchmark on A100 GPU (name is historical — no CPU run happens here)."""
    return {
        # BUG FIX: previously reported "gpu": None even though the function is
        # configured with gpu="A100"; report the requested GPU so the result
        # dict is consistent with the sibling benchmark functions.
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }

@app.function(
    image=lipsync_image,
    gpu="A100",  # Default GPU
    timeout=300
)
def benchmark_a100(size=1024):
    """Run the matmul benchmark on an A100 and return timing metadata."""
    avg_time = benchmark_matrix_multiplication(size, 10)
    device_name = torch.cuda.get_device_name(0)
    return {
        "gpu": "A100",
        "matrix_size": size,
        "avg_time": avg_time,
        "device": device_name,
    }

@app.function(
    image=lipsync_image,
    gpu="L40S",  # L40S GPU
    timeout=300,
)
def benchmark_l40s(size=1024):
    """Benchmark on L40S GPU"""
    # BUG FIX: the docstring previously appeared *after* `import torch`,
    # making it a discarded string expression instead of the docstring.
    import torch
    return {
        "gpu": "L40S",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }

@app.function(
    image=lipsync_image,
    gpu="H100",  # H100 GPU
    timeout=300,
)
def benchmark_h100(size=1024):
    """Benchmark on H100 GPU"""
    # BUG FIX: the docstring previously appeared *after* `import torch`,
    # making it a discarded string expression instead of the docstring.
    import torch
    return {
        "gpu": "H100",
        "matrix_size": size,
        "avg_time": benchmark_matrix_multiplication(size, 10),
        "device": torch.cuda.get_device_name(0)
    }

# Main function to run all benchmarks
@app.local_entrypoint()
def main():
    """Run the matrix-multiplication benchmark remotely and print the result."""
    print("Starting matrix multiplication benchmarks...")

    # Only the (misnamed) A100 run is executed today.
    print("\nRunning benchmark across different GPUs...")
    print(benchmark_cpu.remote(1024))
    # TODO(review): fan out benchmark_a100 / benchmark_l40s / benchmark_h100
    # in parallel (e.g. via .spawn() + .get()) when needed. The previous
    # version carried a sketch of that in a no-op triple-quoted string
    # literal (dead code evaluated and discarded at runtime) — removed.