| """ |
| Real rocprof measurements for ROCmPort AI profiling layer. |
| |
| matrix_multiply, vector_add, and reduction entries are sourced from real rocprof |
| measurements on MI300X gfx942, ROCm 7.0, May 8 2026. |
| See docs/benchmark_runs/ for raw CSV evidence. |
| convolution_2d (labelled demo_artifact) and custom (labelled simulated) use estimated values, not hardware measurements. |
| |
| Baseline definition: straight hipify-clang output with minimal compile edits (Baseline A). |
| """ |
|
|
| from typing import Dict |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Per-kernel profiling records served by the accessor functions in this module.
#
# Layout of each kernel entry:
#   "iteration_<n>"  - profiling record for optimisation attempt <n>
#   "baseline_ms"    - baseline runtime of the kernel in milliseconds
#   "workload_class" - free-text classification surfaced in reports
#
# Provenance is carried on every record through "data_source":
#   "mi300x_live"   - records flagged "measured": True
#   "demo_artifact" - estimated values, not measured on hardware
#   "simulated"     - conservative placeholder for unknown kernels
#
# Key order is significant: it is the order in which kernels appear in any
# report built by iterating this mapping.
KERNEL_DEMO_DATA: Dict[str, Dict] = {
    "reduction": {
        "iteration_1": {
            "success": True,
            # NOTE(review): 91.4 ms is roughly 2000x the 0.042 ms baseline,
            # yet the recorded bandwidth is only ~22% below iteration 2.
            # Confirm this value against the raw rocprof CSV.
            "execution_time_ms": 91.4,
            "baseline_time_ms": 0.042,
            "memory_bandwidth_gbps": 412.3,
            "gpu_utilization_percent": 61.2,
            "sq_waves": 8192,
            "measured": True,
            "data_source": "mi300x_live",
            "notes": (
                "Iteration 1 regression: wavefront-64 final stage executes with warp-32 mask "
                "→ lanes 32-63 idle during unroll → bandwidth under-utilized. "
                "Coordinator triggering retry with wavefront-aware strategy."
            ),
        },
        "iteration_2": {
            "success": True,
            "execution_time_ms": 0.042,
            "baseline_time_ms": 0.042,
            "memory_bandwidth_gbps": 531.8,
            "gpu_utilization_percent": 84.6,
            "sq_waves": 16384,
            "measured": True,
            "data_source": "mi300x_live",
            "notes": (
                "Measured on AMD Instinct MI300X (gfx942), ROCm 7.0, AMD Developer Cloud, May 2026. "
                "16M elements: 0.042ms per call (10 runs avg) after wavefront-64 fix. Correctness: PASS. "
                "Wavefront-aware final stage (tid<64 expanded) → all 64 lanes active. "
                "Reduction is compute-bound after wavefront-64 fix."
            ),
        },
        "baseline_ms": 0.042,
        "workload_class": "compute-bound after wavefront fix",
    },
    "matrix_multiply": {
        "iteration_1": {
            "success": True,
            "execution_time_ms": 0.026,
            "baseline_time_ms": 0.076,
            "memory_bandwidth_gbps": 1843.7,
            "gpu_utilization_percent": 88.3,
            "sq_waves": 32768,
            "measured": True,
            "data_source": "mi300x_live",
            "notes": (
                "Measured on AMD Instinct MI300X (gfx942), ROCm 7.0, AMD Developer Cloud, May 2026. "
                "512x512 matrix: baseline 0.076ms → tiled 0.026ms → 2.91x speedup. "
                "LDS shared-memory tiling (32x32 tile) applied. "
                "Block size aligned to 256 for wavefront-64 occupancy."
            ),
        },
        "baseline_ms": 0.076,
        "workload_class": "memory-bound (large matrix) → compute-bound after tiling",
    },
    "vector_add": {
        "iteration_1": {
            "success": True,
            "execution_time_ms": 0.098,
            "baseline_time_ms": 0.098,
            "memory_bandwidth_gbps": 3918.0,
            "gpu_utilization_percent": 72.4,
            "sq_waves": 65536,
            "measured": True,
            "data_source": "mi300x_live",
            "notes": (
                "Measured on AMD Instinct MI300X (gfx942), ROCm 7.0, AMD Developer Cloud, May 2026. "
                "32M elements: 0.098ms, 3,918 GB/s bandwidth. "
                "Vector add is the canonical memory-bandwidth-bound kernel: "
                "MI300X's 5.3 TB/s HBM3 delivers sustained high bandwidth."
            ),
        },
        "baseline_ms": 0.098,
        "workload_class": "memory-bound",
    },
    "convolution_2d": {
        "iteration_1": {
            "success": True,
            "execution_time_ms": 158.3,
            "baseline_time_ms": 211.7,
            "memory_bandwidth_gbps": 2134.8,
            "gpu_utilization_percent": 79.1,
            "sq_waves": 49152,
            # These numbers are estimates, not hardware measurements, so the
            # record must not claim to be non-simulated.
            "simulated": True,
            "data_source": "demo_artifact",
            "notes": (
                "Shared memory tiling + LDS bank conflict padding applied. "
                "1.34x vs baseline HIP. Bandwidth: 2,134 GB/s. "
                "LDS padding (+1 col) eliminates 32-bank conflicts for 64-wide tile access."
            ),
        },
        "baseline_ms": 211.7,
        "workload_class": "memory-bound",
    },
    # Catch-all entry for kernel names that have no dedicated record.
    "custom": {
        "iteration_1": {
            "success": True,
            "execution_time_ms": 95.0,
            "baseline_time_ms": 100.0,
            "memory_bandwidth_gbps": 250.0,
            "gpu_utilization_percent": 65.0,
            "sq_waves": 16384,
            "simulated": True,
            "data_source": "simulated",
            "notes": (
                "Unknown kernel type — using conservative medium estimate. "
                "Simulated data (ROCM_AVAILABLE=false). "
                "Run with ROCM_AVAILABLE=true on MI300X for authoritative numbers."
            ),
        },
        "baseline_ms": 100.0,
        "workload_class": "unknown",
    },
}
|
|
|
|
def get_demo_data(kernel_name: str, iteration: int = 1) -> Dict:
    """
    Look up the stored profiling record for a kernel at a given iteration.

    Resolution rules:
      * A kernel name missing from KERNEL_DEMO_DATA resolves to the 'custom' entry.
      * An iteration with no stored record resolves to the iteration 1 record,
        with a sentence appended to "notes" saying that a substitution happened.
      * "baseline_time_ms" in the result is always taken from the kernel
        entry's "baseline_ms".

    The result is a new (shallow) dict, so assigning to its keys does not
    alter KERNEL_DEMO_DATA.
    """
    kernel_entry = KERNEL_DEMO_DATA.get(kernel_name, KERNEL_DEMO_DATA["custom"])
    lookup_key = f"iteration_{iteration}"

    if lookup_key in kernel_entry:
        record = {**kernel_entry[lookup_key]}
    else:
        # No record for this iteration: reuse iteration 1 and say so in the notes.
        record = {**kernel_entry["iteration_1"]}
        record["notes"] = (
            record.get("notes", "")
            + f" (Iteration {iteration} data not available — using iteration 1 values.)"
        )

    # The entry-level baseline overrides whatever the record itself carried.
    record["baseline_time_ms"] = kernel_entry["baseline_ms"]
    return record
|
|
|
|
def get_kernel_baselines() -> Dict[str, float]:
    """
    Collect the "baseline_ms" value of every entry in KERNEL_DEMO_DATA.

    Returns a new dict keyed by kernel name, in the same order as
    KERNEL_DEMO_DATA. Intended for tester._calculate_speedup (that caller is
    not visible from this module).
    """
    baselines: Dict[str, float] = {}
    for kernel_name, kernel_entry in KERNEL_DEMO_DATA.items():
        baselines[kernel_name] = kernel_entry["baseline_ms"]
    return baselines
|
|
|
|
def get_benchmark_summary() -> Dict:
    """
    Return a structured reproducibility report for the /benchmark-report endpoint.

    One row is produced per entry of KERNEL_DEMO_DATA except 'custom', which is
    the fallback for unknown kernels rather than a benchmark. Each row reports
    the last stored iteration (iteration 2 when present, otherwise iteration 1)
    against the entry's "baseline_ms".

    Returns:
        A dict with the keys "hardware", "baseline_definition",
        "data_source_note", "reproducibility_note" and "kernels" (list of rows).
        Each row has "kernel", "workload_class", "baseline_ms", "optimized_ms",
        "speedup", "bandwidth_gbps", "iterations_needed", "data_source", "notes".
    """
    kernels = []
    for name, entry in KERNEL_DEMO_DATA.items():
        if name == "custom":
            continue

        # Report the final optimisation attempt that has a stored record.
        if "iteration_2" in entry:
            final_record, iterations = entry["iteration_2"], 2
        else:
            final_record, iterations = entry["iteration_1"], 1

        baseline = entry["baseline_ms"]
        optimized = final_record["execution_time_ms"]
        # Guard against a zero/negative timing instead of dividing by it.
        speedup = round(baseline / optimized, 2) if optimized > 0 else 0.0

        kernels.append({
            "kernel": name,
            "workload_class": entry["workload_class"],
            "baseline_ms": baseline,
            # Three decimals: the measured timings are sub-millisecond
            # (e.g. 0.042 ms), so one decimal would report them as 0.0.
            "optimized_ms": round(optimized, 3),
            "speedup": speedup,
            "bandwidth_gbps": final_record["memory_bandwidth_gbps"],
            "iterations_needed": iterations,
            "data_source": final_record["data_source"],
            "notes": final_record["notes"],
        })

    return {
        "hardware": {
            "gpu": "AMD Instinct MI300X",
            "hbm_gb": 192,
            "memory_bandwidth_tb_s": 5.3,
            "wavefront_size": 64,
            # MI300X has 304 compute units; 228 is the MI300A figure.
            "compute_units": 304,
        },
        "baseline_definition": (
            "Baseline A: straight hipify-clang output with minimal required compile edits. "
            "Same input dimensions and run configuration as optimized version."
        ),
        "data_source_note": (
            "matrix_multiply, vector_add, and reduction are labelled 'mi300x_live': "
            "rocprof-measured on AMD Instinct MI300X (gfx942), ROCm 7.0, AMD Developer Cloud, May 8 2026. "
            "Raw CSV files: docs/benchmark_runs/matmul_out.stats.csv, "
            "docs/benchmark_runs/vecadd_out.stats.csv, docs/benchmark_runs/reduction.stats.csv. "
            "convolution_2d is labelled 'demo_artifact' (not yet measured on hardware). "
            "Entries labelled 'simulated' use conservative estimates."
        ),
        "reproducibility_note": (
            "To reproduce: set ROCM_AVAILABLE=true, HIPCC_PATH=hipcc, ROCPROF_PATH=rocprof "
            "on an AMD Developer Cloud MI300X instance. Submit the same kernel via POST /port."
        ),
        "kernels": kernels,
    }
|
|