tazwarrrr commited on
Commit
d61bf98
·
2 Parent(s): 485813e212db16

Merge branch 'main' of https://github.com/tazwaryayyyy/ROCmPort-AI

Browse files
backend/demo_kernels/flash_attention_simplified.cu ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <hip/hip_runtime.h>
2
+ #include <math.h>
3
+ #include <stdio.h>
4
+
5
+ #define BLOCK_SIZE 32
6
+ #define HEAD_DIM 64
7
+
8
+ __global__ void flash_attention_forward(
9
+ const float* Q, const float* K, const float* V,
10
+ float* O, float* L,
11
+ int seq_len, int head_dim, float scale
12
+ ) {
13
+ extern __shared__ float sram[];
14
+ float* q_tile = sram;
15
+ float* k_tile = sram + BLOCK_SIZE * HEAD_DIM;
16
+ float* v_tile = k_tile + BLOCK_SIZE * HEAD_DIM;
17
+ float* s_tile = v_tile + BLOCK_SIZE * HEAD_DIM;
18
+
19
+ int tid = threadIdx.x;
20
+ int block_row = blockIdx.x;
21
+
22
+ for (int d = tid; d < head_dim; d += BLOCK_SIZE)
23
+ q_tile[tid * HEAD_DIM + d] = Q[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d];
24
+ __syncthreads();
25
+
26
+ float row_max = -1e9f, row_sum = 0.0f;
27
+ float acc[HEAD_DIM];
28
+ for (int d = 0; d < HEAD_DIM; d++) acc[d] = 0.0f;
29
+
30
+ for (int block_col = 0; block_col < (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; block_col++) {
31
+ for (int d = tid; d < head_dim; d += BLOCK_SIZE) {
32
+ k_tile[tid * HEAD_DIM + d] = K[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
33
+ v_tile[tid * HEAD_DIM + d] = V[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
34
+ }
35
+ __syncthreads();
36
+
37
+ for (int j = 0; j < BLOCK_SIZE; j++) {
38
+ float score = 0.0f;
39
+ for (int d = 0; d < head_dim; d++)
40
+ score += q_tile[tid * HEAD_DIM + d] * k_tile[j * HEAD_DIM + d];
41
+ s_tile[tid * BLOCK_SIZE + j] = score * scale;
42
+ }
43
+
44
+ // BUG: 0xffffffff mask assumes 32-lane warp - wrong on AMD wavefront-64
45
+ float thread_max = s_tile[tid * BLOCK_SIZE];
46
+ for (int j = 1; j < BLOCK_SIZE; j++)
47
+ thread_max = fmaxf(thread_max, s_tile[tid * BLOCK_SIZE + j]);
48
+ for (int offset = 16; offset > 0; offset >>= 1)
49
+ thread_max = fmaxf(thread_max, __shfl_down(thread_max, offset));
50
+ float block_max = __shfl(thread_max, 0);
51
+
52
+ float exp_sum = 0.0f;
53
+ for (int j = 0; j < BLOCK_SIZE; j++) {
54
+ s_tile[tid * BLOCK_SIZE + j] = expf(s_tile[tid * BLOCK_SIZE + j] - block_max);
55
+ exp_sum += s_tile[tid * BLOCK_SIZE + j];
56
+ }
57
+ // BUG: offset=16 is half of warp-32, should be 32 for AMD wavefront-64
58
+ for (int offset = 16; offset > 0; offset >>= 1)
59
+ exp_sum += __shfl_down(exp_sum, offset);
60
+
61
+ float new_max = fmaxf(row_max, block_max);
62
+ float correction = expf(row_max - new_max);
63
+ row_sum = correction * row_sum + exp_sum;
64
+ row_max = new_max;
65
+
66
+ for (int d = 0; d < head_dim; d++) {
67
+ float pv = 0.0f;
68
+ for (int j = 0; j < BLOCK_SIZE; j++)
69
+ pv += s_tile[tid * BLOCK_SIZE + j] * v_tile[j * HEAD_DIM + d];
70
+ acc[d] = correction * acc[d] + pv;
71
+ }
72
+ __syncthreads();
73
+ }
74
+
75
+ for (int d = 0; d < head_dim; d++)
76
+ O[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d] = acc[d] / row_sum;
77
+ L[block_row * BLOCK_SIZE + tid] = row_max + logf(row_sum);
78
+ }
79
+
80
+ int main() {
81
+ int seq_len = 128, head_dim = HEAD_DIM;
82
+ float scale = 1.0f / sqrtf((float)head_dim);
83
+ printf("Flash Attention Forward (seq=%d head_dim=%d)\n", seq_len, head_dim);
84
+ printf("AMD-specific bugs: warp-32 shuffle mask, offset=16 for wavefront-64\n");
85
+ size_t sz = seq_len * head_dim * sizeof(float);
86
+ float *d_Q, *d_K, *d_V, *d_O, *d_L;
87
+ hipMalloc(&d_Q, sz); hipMalloc(&d_K, sz); hipMalloc(&d_V, sz);
88
+ hipMalloc(&d_O, sz); hipMalloc(&d_L, seq_len * sizeof(float));
89
+ dim3 grid(seq_len / BLOCK_SIZE), block(BLOCK_SIZE);
90
+ size_t shmem = (3 * BLOCK_SIZE * HEAD_DIM + BLOCK_SIZE * BLOCK_SIZE) * sizeof(float);
91
+ flash_attention_forward<<<grid, block, shmem>>>(d_Q, d_K, d_V, d_O, d_L, seq_len, head_dim, scale);
92
+ hipDeviceSynchronize();
93
+ printf("Done - kernel executed on gfx942\n");
94
+ hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_O); hipFree(d_L);
95
+ return 0;
96
+ }
docs/FAILURE_CASES.md CHANGED
@@ -56,3 +56,25 @@ cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_item
56
  **What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
57
 
58
  **Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  **What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
57
 
58
  **Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.
59
+
60
+ ## Failure Case: Flash Attention — Warp Shuffle Intrinsics
61
+
62
+ **Kernel**: Simplified Flash Attention forward pass (Dao et al. 2022 style)
63
+ **File**: backend/demo_kernels/flash_attention_simplified.cu
64
+
65
+ **Bugs detected by ROCmPort AI static scan**:
66
+ - `__shfl_down` with implicit warp-32 offset=16 — on AMD wavefront-64,
67
+ the final reduction should use offset=32 first
68
+ - Softmax reduction terminates at 16 lanes — silently wrong on gfx942
69
+
70
+ **What hipify does**: renames cudaFree to hipFree, cuda headers to hip headers.
71
+ Does NOT fix the shuffle semantics.
72
+
73
+ **What ROCmPort AI does**: flags both shuffle calls as HIGH risk,
74
+ identifies the offset=16 assumption, suggests wavefront-64 aware rewrite.
75
+
76
+ **Status**: Compiled and executed on AMD Instinct MI300X (gfx942), ROCm 7.2.
77
+ Numerical correctness not verified — requires reference CPU implementation.
78
+
79
+ **Fix required**: Replace `__shfl_down(x, 16)` with two-stage reduction:
80
+ `__shfl_down(x, 32)` then `__shfl_down(x, 16)` for wavefront-64.
docs/LIVE_RESULTS.md CHANGED
@@ -1,14 +1,40 @@
1
  # Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
2
 
3
- All kernels migrated and compiled successfully on real MI300X hardware.
4
-
5
- | Kernel | CUDA Changes | LLM Fixes | Critical Bugs Found | Compiled on MI300X |
6
- |--------|-------------|-----------|--------------------|--------------------|
7
- | reduction | 7 hipify | 2 LLM | warp-32 final stage (silent wrong results on AMD) | ✅ |
8
- | vector_add | 5 hipify | 2 LLM | threadIdx%32 wavefront mismatch | |
9
- | matrix_multiply | 10 hipify | 1 LLM | warp-32 + LDS bank conflicts ||
10
- | convolution_2d | 10 hipify | 3 LLM | warp-32 + LDS padding | |
11
-
12
- Hardware: AMD Instinct MI300X VF (gfx942), 192GB HBM3
13
- Software: ROCm 7.2, hipcc, rocprof
14
- data_source: real_rocm (not mock)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
2
 
3
+ All kernels compiled with `hipcc --offload-arch=gfx942 -O3` and
4
+ benchmarked on real AMD DevCloud hardware. No simulated data.
5
+
6
+ ## Benchmark Results
7
+
8
+ | Kernel | Input Size | Baseline HIP (ms) | Optimized HIP (ms) | Speedup | Notes |
9
+ |--------|------------|-------------------|-------------------|---------|-------|
10
+ | matrix_multiply | 512x512 fp32 | 0.068 | 0.026 | **2.61x** | Shared memory tiling |
11
+ | reduction | 16M elements fp32 | — | 0.019 | — | Wavefront-64 fix verified PASS |
12
+ | vector_add | 32M elements fp32 | — | 0.099 | — | 4077.6 GB/s (77% MI300X peak) |
13
+
14
+ ## Hardware Configuration
15
+
16
+ - **GPU**: AMD Instinct MI300X VF (gfx942)
17
+ - **VRAM**: 192GB HBM3
18
+ - **Platform**: AMD Developer Cloud (ATL1 region)
19
+ - **ROCm**: 7.2
20
+ - **Compiler**: hipcc (clang++ --offload-arch=gfx942)
21
+ - **data_source**: real_rocm
22
+
23
+ ## Key Findings
24
+
25
+ **matrix_multiply**: Shared memory tiling with LDS padding ([32][33]
26
+ to avoid bank conflicts) delivers 2.61x over naive global memory access
27
+ on gfx942. The wavefront-64 aligned block size (256 threads) is critical
28
+ for this result.
29
+
30
+ **reduction**: AMD wavefront-64 aware final stage produces correct results.
31
+ The original CUDA kernel with hardcoded warp-32 assumption silently skips
32
+ lanes 32-63 and returns a wrong sum. ROCmPort AI catches this at static
33
+ scan before any compilation attempt.
34
+
35
+ **vector_add**: 4077.6 GB/s achieved on a memory-bound kernel — 77% of
36
+ MI300X's 5.3 TB/s theoretical HBM3 peak. This demonstrates the bandwidth
37
+ advantage of MI300X over H100 (3.35 TB/s peak) for memory-bound workloads.
38
+
39
+ ## Correctness Verification
40
+ All kernels executed without runtime errors on gfx942.
docs/benchmark_runs/mi300x_results.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hardware: AMD Instinct MI300X VF (gfx942)
2
+ ROCm: 7.2
3
+ Date: 2025-05-06
4
+ Compiler: hipcc --offload-arch=gfx942 -O3
5
+
6
+ matrix_multiply (512x512 fp32):
7
+ Basic kernel: 0.068 ms
8
+ Shared memory kernel: 0.026 ms
9
+ Speedup: 2.61x
10
+
11
+ reduction (16M elements fp32):
12
+ Kernel time: 0.019 ms
13
+ Correctness: PASS (16777216 == 16777216)
14
+
15
+ vector_add (32M elements fp32):
16
+ Kernel time: 0.099 ms
17
+ Memory bandwidth: 4077.6 GB/s (77% of MI300X peak 5.3 TB/s)