File size: 3,173 Bytes
8046910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""

OktoBLAS - PyTorch Integration Example

======================================



This example demonstrates how to use OktoBLAS with PyTorch.



Installation:

    pip install oktoblas torch



"""

import oktoblas as ob
import numpy as np
import time

def main():
    """Benchmark OktoBLAS FP16 GEMM against PyTorch and verify correctness.

    Runs a 2048x2048 FP16 matmul benchmark with OktoBLAS (always) and
    PyTorch (only when CUDA is available), reports per-call latency and
    TFLOPS for each, then validates a small FP32 OktoBLAS matmul against
    NumPy. Returns early if PyTorch is not installed.
    """
    print("=" * 60)
    print("OktoBLAS + PyTorch Integration")
    print("=" * 60)

    try:
        import torch
        print(f"\nPyTorch version: {torch.__version__}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
    except ImportError:
        print("PyTorch not installed. Install with: pip install torch")
        return

    # Benchmark comparison
    print("\n" + "-" * 60)
    print("FP16 GEMM Benchmark (2048x2048)")
    print("-" * 60)

    size = 2048
    iterations = 100
    # FLOPs of one n x n GEMM: n^2 output elements, each a length-n
    # multiply-accumulate (2 ops per element).
    # BUG FIX: this was previously computed only inside the CUDA branch,
    # so the OktoBLAS TFLOPS calculation below raised NameError on
    # CPU-only machines.
    flops = 2 * size * size * size

    # Prepare data (shared by both benchmarks for a like-for-like input)
    A_np = np.random.randn(size, size).astype(np.float16)
    B_np = np.random.randn(size, size).astype(np.float16)

    # PyTorch benchmark (GPU only). pytorch_tflops doubles as a sentinel:
    # it stays None when the benchmark is skipped, so the ratio section
    # never reads an unset variable.
    pytorch_tflops = None
    if torch.cuda.is_available():
        A_torch = torch.from_numpy(A_np).cuda()
        B_torch = torch.from_numpy(B_np).cuda()

        # Warmup (lets CUDA finish lazy init / kernel selection)
        for _ in range(10):
            _ = torch.matmul(A_torch, B_torch)
        torch.cuda.synchronize()

        # Benchmark — synchronize before reading the clock, since CUDA
        # kernel launches are asynchronous.
        start = time.perf_counter()
        for _ in range(iterations):
            C_torch = torch.matmul(A_torch, B_torch)
        torch.cuda.synchronize()
        pytorch_time = (time.perf_counter() - start) / iterations * 1000  # ms

        pytorch_tflops = flops / (pytorch_time / 1000) / 1e12
        print(f"PyTorch:  {pytorch_time:.3f} ms ({pytorch_tflops:.1f} TFLOPS)")

    # OktoBLAS benchmark
    # Warmup
    for _ in range(10):
        _ = ob.matmul_fp16(A_np, B_np)

    # Benchmark
    start = time.perf_counter()
    for _ in range(iterations):
        C_ob = ob.matmul_fp16(A_np, B_np)
    oktoblas_time = (time.perf_counter() - start) / iterations * 1000  # ms

    oktoblas_tflops = flops / (oktoblas_time / 1000) / 1e12
    print(f"OktoBLAS: {oktoblas_time:.3f} ms ({oktoblas_tflops:.1f} TFLOPS)")

    # Only compare when the PyTorch benchmark actually ran.
    if pytorch_tflops is not None:
        ratio = oktoblas_tflops / pytorch_tflops * 100
        print(f"\nRatio: {ratio:.1f}% of PyTorch")
        if ratio > 100:
            print("🏆 OktoBLAS WINS!")

    # Verify correctness
    print("\n" + "-" * 60)
    print("Correctness Check")
    print("-" * 60)

    # Small matrix for verification — FP32 so the NumPy reference and the
    # 0.01 absolute tolerance below are meaningful.
    A_small = np.random.randn(64, 64).astype(np.float32)
    B_small = np.random.randn(64, 64).astype(np.float32)

    C_numpy = np.matmul(A_small, B_small)
    C_oktoblas = ob.matmul(A_small, B_small)

    diff = np.abs(C_numpy - C_oktoblas).max()
    print(f"Max difference from NumPy: {diff:.6f}")
    print(f"Correctness: {'✅ PASS' if diff < 0.01 else '❌ FAIL'}")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)

if __name__ == "__main__":
    main()