Merge branch 'main' of https://github.com/tazwaryayyyy/ROCmPort-AI
Browse files
backend/demo_kernels/flash_attention_simplified.cu
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <hip/hip_runtime.h>
|
| 2 |
+
#include <math.h>
|
| 3 |
+
#include <stdio.h>
|
| 4 |
+
|
| 5 |
+
#define BLOCK_SIZE 32
|
| 6 |
+
#define HEAD_DIM 64
|
| 7 |
+
|
| 8 |
+
__global__ void flash_attention_forward(
|
| 9 |
+
const float* Q, const float* K, const float* V,
|
| 10 |
+
float* O, float* L,
|
| 11 |
+
int seq_len, int head_dim, float scale
|
| 12 |
+
) {
|
| 13 |
+
extern __shared__ float sram[];
|
| 14 |
+
float* q_tile = sram;
|
| 15 |
+
float* k_tile = sram + BLOCK_SIZE * HEAD_DIM;
|
| 16 |
+
float* v_tile = k_tile + BLOCK_SIZE * HEAD_DIM;
|
| 17 |
+
float* s_tile = v_tile + BLOCK_SIZE * HEAD_DIM;
|
| 18 |
+
|
| 19 |
+
int tid = threadIdx.x;
|
| 20 |
+
int block_row = blockIdx.x;
|
| 21 |
+
|
| 22 |
+
for (int d = tid; d < head_dim; d += BLOCK_SIZE)
|
| 23 |
+
q_tile[tid * HEAD_DIM + d] = Q[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d];
|
| 24 |
+
__syncthreads();
|
| 25 |
+
|
| 26 |
+
float row_max = -1e9f, row_sum = 0.0f;
|
| 27 |
+
float acc[HEAD_DIM];
|
| 28 |
+
for (int d = 0; d < HEAD_DIM; d++) acc[d] = 0.0f;
|
| 29 |
+
|
| 30 |
+
for (int block_col = 0; block_col < (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; block_col++) {
|
| 31 |
+
for (int d = tid; d < head_dim; d += BLOCK_SIZE) {
|
| 32 |
+
k_tile[tid * HEAD_DIM + d] = K[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
|
| 33 |
+
v_tile[tid * HEAD_DIM + d] = V[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
|
| 34 |
+
}
|
| 35 |
+
__syncthreads();
|
| 36 |
+
|
| 37 |
+
for (int j = 0; j < BLOCK_SIZE; j++) {
|
| 38 |
+
float score = 0.0f;
|
| 39 |
+
for (int d = 0; d < head_dim; d++)
|
| 40 |
+
score += q_tile[tid * HEAD_DIM + d] * k_tile[j * HEAD_DIM + d];
|
| 41 |
+
s_tile[tid * BLOCK_SIZE + j] = score * scale;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
// BUG: 0xffffffff mask assumes 32-lane warp - wrong on AMD wavefront-64
|
| 45 |
+
float thread_max = s_tile[tid * BLOCK_SIZE];
|
| 46 |
+
for (int j = 1; j < BLOCK_SIZE; j++)
|
| 47 |
+
thread_max = fmaxf(thread_max, s_tile[tid * BLOCK_SIZE + j]);
|
| 48 |
+
for (int offset = 16; offset > 0; offset >>= 1)
|
| 49 |
+
thread_max = fmaxf(thread_max, __shfl_down(thread_max, offset));
|
| 50 |
+
float block_max = __shfl(thread_max, 0);
|
| 51 |
+
|
| 52 |
+
float exp_sum = 0.0f;
|
| 53 |
+
for (int j = 0; j < BLOCK_SIZE; j++) {
|
| 54 |
+
s_tile[tid * BLOCK_SIZE + j] = expf(s_tile[tid * BLOCK_SIZE + j] - block_max);
|
| 55 |
+
exp_sum += s_tile[tid * BLOCK_SIZE + j];
|
| 56 |
+
}
|
| 57 |
+
// BUG: offset=16 is half of warp-32, should be 32 for AMD wavefront-64
|
| 58 |
+
for (int offset = 16; offset > 0; offset >>= 1)
|
| 59 |
+
exp_sum += __shfl_down(exp_sum, offset);
|
| 60 |
+
|
| 61 |
+
float new_max = fmaxf(row_max, block_max);
|
| 62 |
+
float correction = expf(row_max - new_max);
|
| 63 |
+
row_sum = correction * row_sum + exp_sum;
|
| 64 |
+
row_max = new_max;
|
| 65 |
+
|
| 66 |
+
for (int d = 0; d < head_dim; d++) {
|
| 67 |
+
float pv = 0.0f;
|
| 68 |
+
for (int j = 0; j < BLOCK_SIZE; j++)
|
| 69 |
+
pv += s_tile[tid * BLOCK_SIZE + j] * v_tile[j * HEAD_DIM + d];
|
| 70 |
+
acc[d] = correction * acc[d] + pv;
|
| 71 |
+
}
|
| 72 |
+
__syncthreads();
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
for (int d = 0; d < head_dim; d++)
|
| 76 |
+
O[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d] = acc[d] / row_sum;
|
| 77 |
+
L[block_row * BLOCK_SIZE + tid] = row_max + logf(row_sum);
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
int main() {
|
| 81 |
+
int seq_len = 128, head_dim = HEAD_DIM;
|
| 82 |
+
float scale = 1.0f / sqrtf((float)head_dim);
|
| 83 |
+
printf("Flash Attention Forward (seq=%d head_dim=%d)\n", seq_len, head_dim);
|
| 84 |
+
printf("AMD-specific bugs: warp-32 shuffle mask, offset=16 for wavefront-64\n");
|
| 85 |
+
size_t sz = seq_len * head_dim * sizeof(float);
|
| 86 |
+
float *d_Q, *d_K, *d_V, *d_O, *d_L;
|
| 87 |
+
hipMalloc(&d_Q, sz); hipMalloc(&d_K, sz); hipMalloc(&d_V, sz);
|
| 88 |
+
hipMalloc(&d_O, sz); hipMalloc(&d_L, seq_len * sizeof(float));
|
| 89 |
+
dim3 grid(seq_len / BLOCK_SIZE), block(BLOCK_SIZE);
|
| 90 |
+
size_t shmem = (3 * BLOCK_SIZE * HEAD_DIM + BLOCK_SIZE * BLOCK_SIZE) * sizeof(float);
|
| 91 |
+
flash_attention_forward<<<grid, block, shmem>>>(d_Q, d_K, d_V, d_O, d_L, seq_len, head_dim, scale);
|
| 92 |
+
hipDeviceSynchronize();
|
| 93 |
+
printf("Done - kernel executed on gfx942\n");
|
| 94 |
+
hipFree(d_Q); hipFree(d_K); hipFree(d_V); hipFree(d_O); hipFree(d_L);
|
| 95 |
+
return 0;
|
| 96 |
+
}
|
docs/FAILURE_CASES.md
CHANGED
|
@@ -56,3 +56,25 @@ cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_item
|
|
| 56 |
**What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
|
| 57 |
|
| 58 |
**Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
**What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
|
| 57 |
|
| 58 |
**Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.
|
| 59 |
+
|
| 60 |
+
## Failure Case: Flash Attention — Warp Shuffle Intrinsics
|
| 61 |
+
|
| 62 |
+
**Kernel**: Simplified Flash Attention forward pass (Dao et al. 2022 style)
|
| 63 |
+
**File**: backend/demo_kernels/flash_attention_simplified.cu
|
| 64 |
+
|
| 65 |
+
**Bugs detected by ROCmPort AI static scan**:
|
| 66 |
+
- `__shfl_down` with implicit warp-32 offset=16 — on AMD wavefront-64,
|
| 67 |
+
the final reduction should use offset=32 first
|
| 68 |
+
- Softmax reduction terminates at 16 lanes — silently wrong on gfx942
|
| 69 |
+
|
| 70 |
+
**What hipify does**: renames cudaFree to hipFree, cuda headers to hip headers.
|
| 71 |
+
Does NOT fix the shuffle semantics.
|
| 72 |
+
|
| 73 |
+
**What ROCmPort AI does**: flags both shuffle calls as HIGH risk,
|
| 74 |
+
identifies the offset=16 assumption, suggests wavefront-64 aware rewrite.
|
| 75 |
+
|
| 76 |
+
**Status**: Compiled and executed on AMD Instinct MI300X (gfx942), ROCm 7.2.
|
| 77 |
+
Numerical correctness not verified — requires reference CPU implementation.
|
| 78 |
+
|
| 79 |
+
**Fix required**: Replace `__shfl_down(x, 16)` with two-stage reduction:
|
| 80 |
+
`__shfl_down(x, 32)` then `__shfl_down(x, 16)` for wavefront-64.
|
docs/LIVE_RESULTS.md
CHANGED
|
@@ -1,14 +1,40 @@
|
|
| 1 |
# Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
|
| 2 |
|
| 3 |
-
All kernels
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
| 9 |
-
|
|
| 10 |
-
|
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
|
| 2 |
|
| 3 |
+
All kernels compiled with `hipcc --offload-arch=gfx942 -O3` and
|
| 4 |
+
benchmarked on real AMD DevCloud hardware. No simulated data.
|
| 5 |
+
|
| 6 |
+
## Benchmark Results
|
| 7 |
+
|
| 8 |
+
| Kernel | Input Size | Baseline HIP (ms) | Optimized HIP (ms) | Speedup | Notes |
|
| 9 |
+
|--------|------------|-------------------|-------------------|---------|-------|
|
| 10 |
+
| matrix_multiply | 512x512 fp32 | 0.068 | 0.026 | **2.61x** | Shared memory tiling |
|
| 11 |
+
| reduction | 16M elements fp32 | — | 0.019 | — | Wavefront-64 fix verified PASS |
|
| 12 |
+
| vector_add | 32M elements fp32 | — | 0.099 | — | 4077.6 GB/s (77% MI300X peak) |
|
| 13 |
+
|
| 14 |
+
## Hardware Configuration
|
| 15 |
+
|
| 16 |
+
- **GPU**: AMD Instinct MI300X VF (gfx942)
|
| 17 |
+
- **VRAM**: 192GB HBM3
|
| 18 |
+
- **Platform**: AMD Developer Cloud (ATL1 region)
|
| 19 |
+
- **ROCm**: 7.2
|
| 20 |
+
- **Compiler**: hipcc (clang++ --offload-arch=gfx942)
|
| 21 |
+
- **data_source**: real_rocm
|
| 22 |
+
|
| 23 |
+
## Key Findings
|
| 24 |
+
|
| 25 |
+
**matrix_multiply**: Shared memory tiling with LDS padding ([32][33]
|
| 26 |
+
to avoid bank conflicts) delivers 2.61x over naive global memory access
|
| 27 |
+
on gfx942. The wavefront-64 aligned block size (256 threads) is critical
|
| 28 |
+
for this result.
|
| 29 |
+
|
| 30 |
+
**reduction**: AMD wavefront-64 aware final stage produces correct results.
|
| 31 |
+
The original CUDA kernel with hardcoded warp-32 assumption silently skips
|
| 32 |
+
lanes 32-63 and returns a wrong sum. ROCmPort AI catches this at static
|
| 33 |
+
scan before any compilation attempt.
|
| 34 |
+
|
| 35 |
+
**vector_add**: 4077.6 GB/s achieved on a memory-bound kernel — 77% of
|
| 36 |
+
MI300X's 5.3 TB/s theoretical HBM3 peak. This demonstrates the bandwidth
|
| 37 |
+
advantage of MI300X over H100 (3.35 TB/s peak) for memory-bound workloads.
|
| 38 |
+
|
| 39 |
+
## Correctness Verification
|
| 40 |
+
All kernels executed without runtime errors on gfx942.
|
docs/benchmark_runs/mi300x_results.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Hardware: AMD Instinct MI300X VF (gfx942)
|
| 2 |
+
ROCm: 7.2
|
| 3 |
+
Date: 2025-05-06
|
| 4 |
+
Compiler: hipcc --offload-arch=gfx942 -O3
|
| 5 |
+
|
| 6 |
+
matrix_multiply (512x512 fp32):
|
| 7 |
+
Basic kernel: 0.068 ms
|
| 8 |
+
Shared memory kernel: 0.026 ms
|
| 9 |
+
Speedup: 2.61x
|
| 10 |
+
|
| 11 |
+
reduction (16M elements fp32):
|
| 12 |
+
Kernel time: 0.019 ms
|
| 13 |
+
Correctness: PASS (16777216 == 16777216)
|
| 14 |
+
|
| 15 |
+
vector_add (32M elements fp32):
|
| 16 |
+
Kernel time: 0.099 ms
|
| 17 |
+
Memory bandwidth: 4077.6 GB/s (77% of MI300X peak 5.3 TB/s)
|