oktoblast / examples /python /pytorch_integration.py

Upload 20 files

8046910 verified about 1 month ago

3.17 kB

	"""
	OktoBLAS - PyTorch Integration Example
	======================================

	This example demonstrates how to use OktoBLAS with PyTorch.

	Installation:
	pip install oktoblas torch

	"""

	import oktoblas as ob
	import numpy as np
	import time

	def main():
	print("=" * 60)
	print("OktoBLAS + PyTorch Integration")
	print("=" * 60)

	try:
	import torch
	print(f"\nPyTorch version: {torch.__version__}")
	print(f"CUDA available: {torch.cuda.is_available()}")
	if torch.cuda.is_available():
	print(f"GPU: {torch.cuda.get_device_name()}")
	except ImportError:
	print("PyTorch not installed. Install with: pip install torch")
	return

	# Benchmark comparison
	print("\n" + "-" * 60)
	print("FP16 GEMM Benchmark (2048x2048)")
	print("-" * 60)

	size = 2048
	iterations = 100

	# Prepare data
	A_np = np.random.randn(size, size).astype(np.float16)
	B_np = np.random.randn(size, size).astype(np.float16)

	# PyTorch benchmark
	if torch.cuda.is_available():
	A_torch = torch.from_numpy(A_np).cuda()
	B_torch = torch.from_numpy(B_np).cuda()

	# Warmup
	for _ in range(10):
	_ = torch.matmul(A_torch, B_torch)
	torch.cuda.synchronize()

	# Benchmark
	start = time.perf_counter()
	for _ in range(iterations):
	C_torch = torch.matmul(A_torch, B_torch)
	torch.cuda.synchronize()
	pytorch_time = (time.perf_counter() - start) / iterations * 1000 # ms

	flops = 2 * size * size * size
	pytorch_tflops = flops / (pytorch_time / 1000) / 1e12
	print(f"PyTorch: {pytorch_time:.3f} ms ({pytorch_tflops:.1f} TFLOPS)")

	# OktoBLAS benchmark
	# Warmup
	for _ in range(10):
	_ = ob.matmul_fp16(A_np, B_np)

	# Benchmark
	start = time.perf_counter()
	for _ in range(iterations):
	C_ob = ob.matmul_fp16(A_np, B_np)
	oktoblas_time = (time.perf_counter() - start) / iterations * 1000 # ms

	oktoblas_tflops = flops / (oktoblas_time / 1000) / 1e12
	print(f"OktoBLAS: {oktoblas_time:.3f} ms ({oktoblas_tflops:.1f} TFLOPS)")

	if torch.cuda.is_available():
	ratio = oktoblas_tflops / pytorch_tflops * 100
	print(f"\nRatio: {ratio:.1f}% of PyTorch")
	if ratio > 100:
	print("🏆 OktoBLAS WINS!")

	# Verify correctness
	print("\n" + "-" * 60)
	print("Correctness Check")
	print("-" * 60)

	# Small matrix for verification
	A_small = np.random.randn(64, 64).astype(np.float32)
	B_small = np.random.randn(64, 64).astype(np.float32)

	C_numpy = np.matmul(A_small, B_small)
	C_oktoblas = ob.matmul(A_small, B_small)

	diff = np.abs(C_numpy - C_oktoblas).max()
	print(f"Max difference from NumPy: {diff:.6f}")
	print(f"Correctness: {'✅ PASS' if diff < 0.01 else '❌ FAIL'}")

	print("\n" + "=" * 60)
	print("Done!")
	print("=" * 60)

	if __name__ == "__main__":
	main()