Fahad-S
/

FastVLM_SANA

Model card Files Files and versions

FastVLM_SANA / ml-stable-diffusion /mlx /benchmarks /python /fft_bench.py

Fahad-S's picture

Upload folder using huggingface_hub

712dbf0 verified 5 months ago

3.5 kB

	# Copyright © 2024 Apple Inc.

	import matplotlib
	import mlx.core as mx
	import numpy as np
	import sympy
	import torch
	from time_utils import measure_runtime

	matplotlib.use("Agg")
	import matplotlib.pyplot as plt


	def bandwidth_gb(runtime_ms, system_size):
	    """Convert a measured FFT runtime into an effective bandwidth in GB/s.

	    Every element is charged twice the size of a complex64 value
	    (presumably one read and one write of the data).

	    Args:
	        runtime_ms: Runtime of the transform in milliseconds.
	        system_size: Total number of elements that were transformed.

	    Returns:
	        Bandwidth in gigabytes (1e9 bytes) per second.
	    """
	    traffic_per_element = np.dtype(np.complex64).itemsize * 2
	    # Keep the multiply/divide order fixed so the floating-point result
	    # is reproducible bit-for-bit.
	    bytes_per_ms = system_size * traffic_per_element / runtime_ms
	    bytes_per_second = bytes_per_ms * 1e3
	    return bytes_per_second / 1e9


	def run_bench(system_size, fft_sizes, backend="mlx", dim=1):
	    """Measure FFT bandwidth for every transform size on one backend.

	    For each ``n`` in ``fft_sizes`` a random complex64 input of shape
	    ``[system_size // n**dim] + [n] * dim`` is created, moved to the
	    backend, and timed with ``measure_runtime``. The resulting bandwidth is
	    printed next to ``n`` and collected.

	    Args:
	        system_size: Target number of elements per run; the batch size is
	            ``system_size // n**dim``.
	        fft_sizes: Iterable of transform lengths ``n``.
	        backend: ``"mlx"`` (``mx.fft``) or ``"mps"`` (``torch.fft`` on the
	            ``"mps"`` device).
	        dim: ``1`` to benchmark ``fft`` or ``2`` to benchmark ``fft2``.

	    Returns:
	        ``np.ndarray`` of bandwidths in GB/s, in the order of ``fft_sizes``.

	    Raises:
	        ValueError: If ``dim`` is not 1 or 2.
	        NotImplementedError: If ``backend`` is not ``"mlx"`` or ``"mps"``.
	    """
	    # Fail fast: previously an unsupported ``dim`` surfaced only as an
	    # UnboundLocalError from inside the timed closure.
	    if dim not in (1, 2):
	        raise ValueError(f"dim must be 1 or 2, got {dim!r}")
	    if backend not in ("mlx", "mps"):
	        raise NotImplementedError(f"unsupported backend: {backend!r}")

	    def fft_mlx(x):
	        out = mx.fft.fft(x) if dim == 1 else mx.fft.fft2(x)
	        # Force evaluation so the timing includes the actual computation.
	        mx.eval(out)
	        return out

	    def fft_mps(x):
	        out = torch.fft.fft(x) if dim == 1 else torch.fft.fft2(x)
	        # Wait for the MPS work to finish before the timer stops.
	        torch.mps.synchronize()
	        return out

	    bandwidths = []
	    for n in fft_sizes:
	        batch_size = system_size // n**dim
	        shape = [batch_size] + [n] * dim
	        # Build the input from ``shape`` so the benchmarked array matches
	        # the element count reported to ``bandwidth_gb`` (for dim == 1 this
	        # is the same (system_size // n, n) layout as before).
	        x_np = np.random.uniform(size=shape).astype(np.complex64)
	        if backend == "mlx":
	            x = mx.array(x_np)
	            mx.eval(x)
	            fft = fft_mlx
	        else:
	            x = torch.tensor(x_np, device="mps")
	            torch.mps.synchronize()
	            fft = fft_mps
	        # NOTE(review): measure_runtime is assumed to return milliseconds,
	        # as the variable name suggests -- confirm against time_utils.
	        runtime_ms = measure_runtime(fft, x=x)
	        bandwidth = bandwidth_gb(runtime_ms, np.prod(shape))
	        print(n, bandwidth)
	        bandwidths.append(bandwidth)

	    return np.array(bandwidths)


	def time_fft():
	    """Benchmark 1-D FFTs for sizes 2..511 and plot/print the results.

	    Runs ``run_bench`` on the MLX GPU stream, on the "mps" backend, and on
	    the MLX CPU stream (the CPU run uses a smaller system size). Three
	    scatter plots are saved: all sizes, sizes whose prime factors are all
	    <= 13 ("Radix 2-13"), and sizes with a prime factor > 13
	    ("Bluestein's"). Average bandwidths and the percentage of sizes where
	    MLX GPU beats MPS are then printed.
	    """
	    sizes = np.arange(2, 512)
	    gpu_system_size = int(2**26)
	    cpu_system_size = int(2**20)

	    print("MLX GPU")
	    with mx.stream(mx.gpu):
	        gpu_bandwidths = run_bench(system_size=gpu_system_size, fft_sizes=sizes)

	    print("MPS GPU")
	    mps_bandwidths = run_bench(
	        system_size=gpu_system_size, fft_sizes=sizes, backend="mps"
	    )

	    print("CPU")
	    with mx.stream(mx.cpu):
	        cpu_bandwidths = run_bench(system_size=cpu_system_size, fft_sizes=sizes)

	    # Positions of each size inside the result arrays (sizes are consecutive).
	    offsets = sizes - sizes[0]
	    # True where every prime factor of the size is at most 13.
	    small_factors_only = np.array(
	        [all(p <= 13 for p in sympy.primefactors(n)) for n in sizes]
	    )
	    subsets = (
	        (offsets, "All"),
	        (offsets[small_factors_only], "Radix 2-13"),
	        (offsets[~small_factors_only], "Bluestein's"),
	    )
	    series = (
	        (gpu_bandwidths, "green", "GPU"),
	        (mps_bandwidths, "blue", "MPS"),
	        (cpu_bandwidths, "red", "CPU"),
	    )

	    for indices, name in subsets:
	        # plot bandwidths
	        print(name)
	        selected_sizes = sizes[indices]
	        for values, color, label in series:
	            plt.scatter(selected_sizes, values[indices], color=color, label=label)
	        plt.title(f"MLX FFT Benchmark -- {name}")
	        plt.xlabel("N")
	        plt.ylabel("Bandwidth (GB/s)")
	        plt.legend()
	        plt.savefig(f"{name}.png")
	        plt.clf()

	    print("Average bandwidths:")
	    for label, values in (
	        ("GPU:", gpu_bandwidths),
	        ("MPS:", mps_bandwidths),
	        ("CPU:", cpu_bandwidths),
	    ):
	        print(label, np.mean(values))

	    # Fraction of sizes for which the MLX GPU run was faster than MPS.
	    mlx_wins = np.count_nonzero(gpu_bandwidths > mps_bandwidths)
	    portion_faster = mlx_wins / len(sizes)
	    print("Percent MLX faster than MPS: ", portion_faster * 100)


	# Script entry point: run the full FFT benchmark only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	time_fft()