"""
OktoBLAS - PyTorch Integration Example
======================================
This example demonstrates how to use OktoBLAS with PyTorch.
Installation:
pip install oktoblas torch
"""
import oktoblas as ob
import numpy as np
import time
def main():
    """Benchmark OktoBLAS FP16 GEMM against PyTorch and verify correctness.

    Prints timing/TFLOPS for a 2048x2048 FP16 matmul on both backends
    (the PyTorch half only when CUDA is available), then checks a small
    FP32 OktoBLAS matmul against NumPy.
    """
    print("=" * 60)
    print("OktoBLAS + PyTorch Integration")
    print("=" * 60)

    # PyTorch is optional; without it the comparison half of the demo
    # cannot run, so bail out early with install instructions.
    try:
        import torch
        print(f"\nPyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
    except ImportError:
        print("PyTorch not installed. Install with: pip install torch")
        return

    # Benchmark comparison
    print("\n" + "-" * 60)
    print("FP16 GEMM Benchmark (2048x2048)")
    print("-" * 60)

    size = 2048
    iterations = 100
    # FLOP count for a square GEMM: 2*N^3 (one multiply + one add per term).
    # BUG FIX: this was previously computed inside the CUDA-only branch,
    # which made the OktoBLAS TFLOPS line below raise NameError on
    # CPU-only machines. It is backend-independent, so hoist it here.
    flops = 2 * size * size * size

    # Prepare shared input data (FP16, as both paths consume half precision)
    A_np = np.random.randn(size, size).astype(np.float16)
    B_np = np.random.randn(size, size).astype(np.float16)

    # PyTorch benchmark — GPU only; skipped (and reported as such via the
    # sentinel below) on CPU-only hosts.
    pytorch_tflops = None
    if torch.cuda.is_available():
        A_torch = torch.from_numpy(A_np).cuda()
        B_torch = torch.from_numpy(B_np).cuda()
        # Warmup so kernel selection/caching does not skew the timing.
        for _ in range(10):
            _ = torch.matmul(A_torch, B_torch)
        torch.cuda.synchronize()
        # Benchmark
        start = time.perf_counter()
        for _ in range(iterations):
            C_torch = torch.matmul(A_torch, B_torch)
        # CUDA launches are asynchronous — wait before stopping the clock.
        torch.cuda.synchronize()
        pytorch_time = (time.perf_counter() - start) / iterations * 1000  # ms
        pytorch_tflops = flops / (pytorch_time / 1000) / 1e12
        print(f"PyTorch: {pytorch_time:.3f} ms ({pytorch_tflops:.1f} TFLOPS)")

    # OktoBLAS benchmark
    # Warmup
    for _ in range(10):
        _ = ob.matmul_fp16(A_np, B_np)
    # Benchmark
    start = time.perf_counter()
    for _ in range(iterations):
        C_ob = ob.matmul_fp16(A_np, B_np)
    oktoblas_time = (time.perf_counter() - start) / iterations * 1000  # ms
    oktoblas_tflops = flops / (oktoblas_time / 1000) / 1e12
    print(f"OktoBLAS: {oktoblas_time:.3f} ms ({oktoblas_tflops:.1f} TFLOPS)")

    # Only compare when the PyTorch GPU benchmark actually ran.
    if pytorch_tflops is not None:
        ratio = oktoblas_tflops / pytorch_tflops * 100
        print(f"\nRatio: {ratio:.1f}% of PyTorch")
        if ratio > 100:
            print("🏆 OktoBLAS WINS!")

    # Verify correctness
    print("\n" + "-" * 60)
    print("Correctness Check")
    print("-" * 60)

    # Small FP32 matrices: NumPy is the reference; 0.01 absolute tolerance
    # is generous enough for FP32 accumulation differences at 64x64.
    A_small = np.random.randn(64, 64).astype(np.float32)
    B_small = np.random.randn(64, 64).astype(np.float32)
    C_numpy = np.matmul(A_small, B_small)
    C_oktoblas = ob.matmul(A_small, B_small)
    diff = np.abs(C_numpy - C_oktoblas).max()
    print(f"Max difference from NumPy: {diff:.6f}")
    print(f"Correctness: {'✅ PASS' if diff < 0.01 else '❌ FAIL'}")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()