File size: 2,678 Bytes
712dbf0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Copyright © 2023-2024 Apple Inc.

import argparse

import mlx.core as mx
import torch
from time_utils import measure_runtime


def benchmark_scatter_mlx(dst_shape, x_shape, idx_shapes):
    def scatter(dst, x, idx):
        dst[tuple(idx)] = x
        mx.eval(dst)

    idx = []
    for idx_shape in idx_shapes:
        idx.append(mx.random.randint(0, dst_shape[0] - 1, idx_shape))
    x = mx.random.normal(x_shape).astype(mx.float32)
    dst = mx.random.normal(dst_shape).astype(mx.float32)

    runtime = measure_runtime(scatter, dst=dst, x=x, idx=idx)
    print(f"MLX: {runtime:.3f}ms")


def benchmark_scatter_torch(dst_shape, x_shape, idx_shapes, device):
    def scatter(dst, x, idx, device):
        dst[tuple(idx)] = x
        if device == torch.device("mps"):
            torch.mps.synchronize()

    idx = []
    for idx_shape in idx_shapes:
        idx.append(torch.randint(0, dst_shape[0] - 1, idx_shape).to(device))
    x = torch.randn(x_shape, dtype=torch.float32).to(device)
    dst = torch.randn(dst_shape, dtype=torch.float32).to(device)

    runtime = measure_runtime(scatter, dst=dst, x=x, idx=idx, device=device)
    print(f"PyTorch: {runtime:.3f}ms")


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Gather benchmarks.")
    parser.add_argument("--cpu", action="store_true", help="Use the CPU.")
    args = parser.parse_args()

    if args.cpu:
        mx.set_default_device(mx.cpu)
        device = torch.device("cpu")
    else:
        device = torch.device("mps")

    dst_shapes = [
        (10, 64),
        (100_000, 64),
        (1_000_000, 64),
        (100_000,),
        (200_000,),
        (20_000_000,),
        (10000, 64),
        (100, 64),
        (100, 10_000, 64),
        (10, 100, 100, 21),
        (1_000, 1_000, 10),
    ]
    idx_shapes = [
        [(1_000_000,)],
        [(1_000_000,)],
        [(100_000,)],
        [(1_000_000,)],
        [(20_000_000,)],
        [(20_000_000,)],
        [(1000000,)],
        [(10000000,)],
        [(1_000,)],
        [(10_000,)],
        [(1_000,), (1_000,)],
    ]
    x_shapes = [
        (1_000_000, 64),
        (1_000_000, 64),
        (100_000, 64),
        (1_000_000,),
        (20_000_000,),
        (20_000_000,),
        (1000000, 64),
        (10000000, 64),
        (1_000, 10_000, 64),
        (10_000, 100, 100, 21),
        (1_000, 10),
    ]

    for dst_shape, x_shape, idx_shape in zip(dst_shapes, x_shapes, idx_shapes):
        print("=" * 20)
        print(f"Dst: {dst_shape}, X {x_shape}, Indices {idx_shape}")
        benchmark_scatter_mlx(dst_shape, x_shape, idx_shape)
        benchmark_scatter_torch(dst_shape, x_shape, idx_shape, device=device)