Spaces:

lablab-ai-amd-developer-hackathon
/

ROCmPort-AI

Running

App Files Files Community

tazwarrrr committed 24 days ago

Commit

d61bf98

2 Parent(s): 485813e 212db16

Merge branch 'main' of https://github.com/tazwaryayyyy/ROCmPort-AI

Browse files

Files changed (4) hide show

backend/demo_kernels/flash_attention_simplified.cu +96 -0
docs/FAILURE_CASES.md +22 -0
docs/LIVE_RESULTS.md +38 -12
docs/benchmark_runs/mi300x_results.txt +17 -0

backend/demo_kernels/flash_attention_simplified.cu ADDED Viewed

	@@ -0,0 +1,96 @@

+#include <hip/hip_runtime.h>
+#include <math.h>
+#include <stdio.h>
+#define BLOCK_SIZE 32
+#define HEAD_DIM 64
+__global__ void flash_attention_forward(
+    const float* Q, const float* K, const float* V,  // inputs, all indexed as [row * head_dim + d]
+    float* O, float* L,  // O: per-row output, same indexing as Q; L: one value per row, row_max + logf(row_sum)
+    int seq_len, int head_dim, float scale  // seq_len only sets the number of K/V tiles; scale multiplies each score
+) {  /*
+     * Simplified attention forward pass kept as a porting demo.
+     *
+     * Each thread owns one query row (row = blockIdx.x * BLOCK_SIZE + threadIdx.x),
+     * walks the K/V rows in tiles of BLOCK_SIZE, and keeps a running max, running
+     * sum and output accumulator that are rescaled from tile to tile.
+     *
+     * Launch assumptions visible from the indexing (confirm against the caller):
+     *   - threadIdx.x is used directly as the tile row, so blockDim.x is presumably BLOCK_SIZE.
+     *   - dynamic shared memory must hold 3 * BLOCK_SIZE * HEAD_DIM floats for the
+     *     q/k/v tiles plus BLOCK_SIZE * BLOCK_SIZE floats for s_tile.
+     *
+     * NOTE(review): tile rows have stride HEAD_DIM and acc has HEAD_DIM entries,
+     * but the loops run to head_dim; nothing here guards head_dim > HEAD_DIM.
+     * NOTE(review): no row index is checked against seq_len, so the accesses assume
+     * seq_len is a multiple of BLOCK_SIZE.
+     * NOTE(review): the file itself labels the shuffle reductions below as bugs;
+     * the code is left exactly as written.
+     */
+    extern __shared__ float sram[];
+    float* q_tile = sram;  // BLOCK_SIZE rows of HEAD_DIM floats
+    float* k_tile = sram + BLOCK_SIZE * HEAD_DIM;
+    float* v_tile = k_tile + BLOCK_SIZE * HEAD_DIM;
+    float* s_tile = v_tile + BLOCK_SIZE * HEAD_DIM;  // scores; row tid is written and read only by thread tid
+    int tid = threadIdx.x;
+    int block_row = blockIdx.x;
+    for (int d = tid; d < head_dim; d += BLOCK_SIZE)  // NOTE(review): thread tid writes only columns tid, tid+BLOCK_SIZE, ... of its own row; the score loop below reads every column
+        q_tile[tid * HEAD_DIM + d] = Q[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d];
+    __syncthreads();
+    float row_max = -1e9f, row_sum = 0.0f;  // running softmax state; the max starts at a large negative value
+    float acc[HEAD_DIM];  // per-thread output accumulator, one slot per column
+    for (int d = 0; d < HEAD_DIM; d++) acc[d] = 0.0f;
+    for (int block_col = 0; block_col < (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; block_col++) {  // ceil(seq_len / BLOCK_SIZE) K/V tiles
+        for (int d = tid; d < head_dim; d += BLOCK_SIZE) {  // same partial column pattern as the Q load above
+            k_tile[tid * HEAD_DIM + d] = K[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
+            v_tile[tid * HEAD_DIM + d] = V[block_col * BLOCK_SIZE * head_dim + tid * head_dim + d];
+        }
+        __syncthreads();  // tile writes must be visible before any thread reads other rows
+        for (int j = 0; j < BLOCK_SIZE; j++) {  // scaled dot product of this thread's Q row with each K row in the tile
+            float score = 0.0f;
+            for (int d = 0; d < head_dim; d++)
+                score += q_tile[tid * HEAD_DIM + d] * k_tile[j * HEAD_DIM + d];
+            s_tile[tid * BLOCK_SIZE + j] = score * scale;
+        }
+        // BUG: the shuffle reduction below assumes a 32-lane warp - wrong on AMD wavefront-64. NOTE(review): no 0xffffffff mask appears in this code; presumably it was in the CUDA original.
+        float thread_max = s_tile[tid * BLOCK_SIZE];
+        for (int j = 1; j < BLOCK_SIZE; j++)  // max over this thread's own score row
+            thread_max = fmaxf(thread_max, s_tile[tid * BLOCK_SIZE + j]);
+        for (int offset = 16; offset > 0; offset >>= 1)  // NOTE(review): folds in other lanes' maxima, i.e. values that belong to other rows
+            thread_max = fmaxf(thread_max, __shfl_down(thread_max, offset));
+        float block_max = __shfl(thread_max, 0);  // every lane takes lane 0's reduced value
+        float exp_sum = 0.0f;
+        for (int j = 0; j < BLOCK_SIZE; j++) {  // exponentiate in place, shifted by block_max
+            s_tile[tid * BLOCK_SIZE + j] = expf(s_tile[tid * BLOCK_SIZE + j] - block_max);
+            exp_sum += s_tile[tid * BLOCK_SIZE + j];
+        }
+        // BUG: offset=16 is half of warp-32, should be 32 for AMD wavefront-64. NOTE(review): main launches BLOCK_SIZE (32) threads per block - confirm that starting at 32 would not read lanes outside the block.
+        for (int offset = 16; offset > 0; offset >>= 1)  // NOTE(review): adds other lanes' sums, which belong to other rows, into this row's sum
+            exp_sum += __shfl_down(exp_sum, offset);
+        float new_max = fmaxf(row_max, block_max);
+        float correction = expf(row_max - new_max);  // rescales the previously accumulated sum and acc to the new max
+        row_sum = correction * row_sum + exp_sum;  // NOTE(review): exp_sum is relative to block_max, not new_max; looks unscaled when row_max > block_max - confirm
+        row_max = new_max;
+        for (int d = 0; d < head_dim; d++) {  // acc += P * V for this tile, after rescaling the old acc
+            float pv = 0.0f;
+            for (int j = 0; j < BLOCK_SIZE; j++)
+                pv += s_tile[tid * BLOCK_SIZE + j] * v_tile[j * HEAD_DIM + d];
+            acc[d] = correction * acc[d] + pv;
+        }
+        __syncthreads();  // all reads of k_tile/v_tile finish before the next iteration overwrites them
+    }
+    for (int d = 0; d < head_dim; d++)  // normalise by the running sum and store this thread's row
+        O[block_row * BLOCK_SIZE * head_dim + tid * head_dim + d] = acc[d] / row_sum;
+    L[block_row * BLOCK_SIZE + tid] = row_max + logf(row_sum);
+}
+// Host driver for the demo kernel: allocates zero-filled device buffers,
+// launches flash_attention_forward once and reports whether the run succeeded.
+// Returns 0 on success and 1 if any HIP call or the kernel launch fails.
+int main() {
+    int seq_len = 128, head_dim = HEAD_DIM;
+    float scale = 1.0f / sqrtf((float)head_dim);
+    printf("Flash Attention Forward (seq=%d head_dim=%d)\n", seq_len, head_dim);
+    printf("AMD-specific bugs: warp-32 shuffle mask, offset=16 for wavefront-64\n");
+    // Widen before multiplying so the byte counts are computed in size_t.
+    size_t sz = (size_t)seq_len * (size_t)head_dim * sizeof(float);
+    size_t l_sz = (size_t)seq_len * sizeof(float);
+    // Null-initialised so the cleanup at the end is safe after a partial failure.
+    float *d_Q = nullptr, *d_K = nullptr, *d_V = nullptr, *d_O = nullptr, *d_L = nullptr;
+    // Reports a failed HIP call on stderr; returns true when the call succeeded.
+    auto check = [](hipError_t status, const char* what) -> bool {
+        if (status == hipSuccess) return true;
+        fprintf(stderr, "HIP error in %s: %s\n", what, hipGetErrorString(status));
+        return false;
+    };
+    // Short-circuits on the first failure so later calls never see a bad pointer.
+    bool ok = check(hipMalloc(&d_Q, sz), "hipMalloc(d_Q)")
+        && check(hipMalloc(&d_K, sz), "hipMalloc(d_K)")
+        && check(hipMalloc(&d_V, sz), "hipMalloc(d_V)")
+        && check(hipMalloc(&d_O, sz), "hipMalloc(d_O)")
+        && check(hipMalloc(&d_L, l_sz), "hipMalloc(d_L)")
+        // Give the kernel defined inputs instead of uninitialised device memory.
+        && check(hipMemset(d_Q, 0, sz), "hipMemset(d_Q)")
+        && check(hipMemset(d_K, 0, sz), "hipMemset(d_K)")
+        && check(hipMemset(d_V, 0, sz), "hipMemset(d_V)");
+    if (ok) {
+        dim3 grid(seq_len / BLOCK_SIZE), block(BLOCK_SIZE);
+        size_t shmem = (3 * BLOCK_SIZE * HEAD_DIM + BLOCK_SIZE * BLOCK_SIZE) * sizeof(float);
+        flash_attention_forward<<<grid, block, shmem>>>(d_Q, d_K, d_V, d_O, d_L, seq_len, head_dim, scale);
+        // Launch-configuration errors surface via hipGetLastError; in-kernel
+        // faults surface at the synchronising call that follows.
+        ok = check(hipGetLastError(), "kernel launch")
+            && check(hipDeviceSynchronize(), "hipDeviceSynchronize");
+    }
+    // Only claim success when the kernel actually ran to completion.
+    if (ok) printf("Done - kernel executed on gfx942\n");
+    // Freeing a null pointer is a no-op, so this is safe on every path.
+    (void)hipFree(d_Q); (void)hipFree(d_K); (void)hipFree(d_V); (void)hipFree(d_O); (void)hipFree(d_L);
+    return ok ? 0 : 1;
+}

docs/FAILURE_CASES.md CHANGED Viewed

@@ -56,3 +56,25 @@ cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_item
 **What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
 **Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.

 **What ROCmPort AI does not do**: guarantee correctness or performance parity for library-heavy code without human validation.
 **Fix requirement**: Manual comparison of CUB vs hipCUB primitive behavior for the specific use case, or replacement with rocPRIM equivalents.
+## Failure Case: Flash Attention — Warp Shuffle Intrinsics
+**Kernel**: Simplified Flash Attention forward pass (Dao et al. 2022 style)
+**File**: backend/demo_kernels/flash_attention_simplified.cu
+**Bugs detected by ROCmPort AI static scan**:
+- `__shfl_down` reductions start at a hard-coded offset=16, which assumes a 32-lane warp — on AMD wavefront-64,
  a reduction over the full wavefront has to start at offset=32
+- The softmax max/sum reductions therefore cover only 32 of the 64 lanes — silently wrong on gfx942
+**What hipify does**: renames cudaFree to hipFree, cuda headers to hip headers.
+Does NOT fix the shuffle semantics.
+**What ROCmPort AI does**: flags both shuffle calls as HIGH risk,
+identifies the offset=16 assumption, suggests wavefront-64 aware rewrite.
+**Status**: Compiled and executed on AMD Instinct MI300X (gfx942), ROCm 7.2.
+Numerical correctness not verified — requires reference CPU implementation.
+**Fix required**: Replace the reduction that starts at `__shfl_down(x, 16)` with one that starts a step earlier on wavefront-64:
  `__shfl_down(x, 32)` first, then `__shfl_down(x, 16)` and the remaining halving steps (8, 4, 2, 1).

docs/LIVE_RESULTS.md CHANGED Viewed

@@ -1,14 +1,40 @@
 # Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
-All kernels migrated and compiled successfully on real MI300X hardware.
-| Kernel | CUDA Changes | LLM Fixes | Critical Bugs Found | Compiled on MI300X |
-|--------|-------------|-----------|--------------------|--------------------|
-| reduction | 7 hipify | 2 LLM | warp-32 final stage (silent wrong results on AMD) | ✅ |
-| vector_add | 5 hipify | 2 LLM | threadIdx%32 wavefront mismatch | ✅ |
-| matrix_multiply | 10 hipify | 1 LLM | warp-32 + LDS bank conflicts | ✅ |
-| convolution_2d | 10 hipify | 3 LLM | warp-32 + LDS padding | ✅ |
-Hardware: AMD Instinct MI300X VF (gfx942), 192GB HBM3
-Software: ROCm 7.2, hipcc, rocprof
-data_source: real_rocm (not mock)

 # Live Results — AMD Instinct MI300X (gfx942), ROCm 7.2
+All kernels compiled with `hipcc --offload-arch=gfx942 -O3` and
+benchmarked on real AMD DevCloud hardware. No simulated data.
+## Benchmark Results
+| Kernel | Input Size | Baseline HIP (ms) | Optimized HIP (ms) | Speedup | Notes |
+|--------|------------|-------------------|-------------------|---------|-------|
+| matrix_multiply | 512x512 fp32 | 0.068 | 0.026 | **2.61x** | Shared memory tiling |
+| reduction | 16M elements fp32 | — | 0.019 | — | Wavefront-64 fix verified PASS |
+| vector_add | 32M elements fp32 | — | 0.099 | — | 4077.6 GB/s (77% MI300X peak) |
+## Hardware Configuration
+- **GPU**: AMD Instinct MI300X VF (gfx942)
+- **VRAM**: 192GB HBM3
+- **Platform**: AMD Developer Cloud (ATL1 region)
+- **ROCm**: 7.2
+- **Compiler**: hipcc (clang++ --offload-arch=gfx942)
+- **data_source**: real_rocm
+## Key Findings
+**matrix_multiply**: Shared memory tiling with LDS padding ([32][33]
+to avoid bank conflicts) delivers 2.61x over naive global memory access
+on gfx942. The wavefront-64 aligned block size (256 threads) is critical
+for this result.
+**reduction**: AMD wavefront-64 aware final stage produces correct results.
+The original CUDA kernel with hardcoded warp-32 assumption silently skips
+lanes 32-63 and returns a wrong sum. ROCmPort AI catches this at static
+scan before any compilation attempt.
+**vector_add**: 4077.6 GB/s achieved on a memory-bound kernel — 77% of
MI300X's 5.3 TB/s theoretical HBM3 peak. The measured figure is above the
H100's 3.35 TB/s theoretical peak, which illustrates the bandwidth advantage
of MI300X for memory-bound workloads (no H100 measurement is included in these results).
+## Correctness Verification
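+All kernels executed without runtime errors on gfx942. A clean run is not a numerical check: of the kernels in the table above, only `reduction` has a verified output (PASS).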

docs/benchmark_runs/mi300x_results.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+Hardware: AMD Instinct MI300X VF (gfx942)
+ROCm: 7.2
+Date: 2025-05-06
+Compiler: hipcc --offload-arch=gfx942 -O3
+matrix_multiply (512x512 fp32):
+  Basic kernel:        0.068 ms
+  Shared memory kernel: 0.026 ms
+  Speedup:             2.61x
+reduction (16M elements fp32):
+  Kernel time:         0.019 ms
+  Correctness:         PASS (16777216 == 16777216)
+vector_add (32M elements fp32):
+  Kernel time:         0.099 ms
+  Memory bandwidth:    4077.6 GB/s (77% of MI300X peak 5.3 TB/s)