# ROCmPort-AI / backend/tools/rocprof_wrapper.py
# fix: priority 1-4 debug pass — retry loop, SSE timeout, rocprof CSV parser,
# silent failures (commit 0b5416e)
import csv
import io
import os
import re
import subprocess
import tempfile
from typing import Dict, List, Optional, Tuple
class RocprofWrapper:
    """Wrapper for AMD rocprof profiler and hipcc compiler.

    When the ROCM_AVAILABLE environment variable is not "true", every
    entry point falls back to deterministic mock/demo data so the rest
    of the stack can run on machines without AMD GPUs.
    """

    def __init__(self):
        # Feature flag: real compilation/profiling only happens when the
        # deployment explicitly opts in with ROCM_AVAILABLE=true.
        self.rocm_available = os.getenv(
            "ROCM_AVAILABLE", "false").lower() == "true"
        # Tool locations are overridable for non-standard installs.
        self.hipcc_path = os.getenv("HIPCC_PATH", "hipcc")
        self.rocprof_path = os.getenv("ROCPROF_PATH", "rocprof")

    def compile_hip_code(self, hip_code: str, output_file: Optional[str] = None) -> Tuple[bool, str]:
        """Compile HIP code using hipcc.

        Args:
            hip_code: HIP/C++ source text to compile.
            output_file: Destination binary path; derived from the
                temporary source file name when omitted.

        Returns:
            (success, message): message holds the output path on success,
            or compiler stderr / error detail on failure.
        """
        if not self.rocm_available:
            return True, "Mock compilation successful (ROCm not available)"
        temp_file = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.hip', delete=False) as f:
                f.write(hip_code)
                temp_file = f.name
            if output_file is None:
                output_file = temp_file.replace('.hip', '.out')
            # Pin --offload-arch=gfx942 to solve the "Cannot find libdevice
            # for sm_52" error; compilation then works even if CUDA device
            # libraries are missing.
            cmd = [self.hipcc_path, '-o', output_file,
                   temp_file, '--offload-arch=gfx942']
            # Propagate the arch through the environment too, in case hipcc
            # invokes nvcc internally.
            env = os.environ.copy()
            env['NVCC_APPEND_FLAGS'] = ' --offload-arch=gfx942'
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=60, env=env, check=False)
            if result.returncode == 0:
                return True, f"Compilation successful: {output_file}"
            return False, f"Compilation failed: {result.stderr}"
        except subprocess.TimeoutExpired:
            return False, "Compilation timed out"
        except (OSError, subprocess.SubprocessError) as e:
            return False, f"Compilation error: {str(e)}"
        finally:
            # Best-effort removal of the temporary .hip source.
            try:
                if temp_file and os.path.exists(temp_file):
                    os.unlink(temp_file)
            except OSError:
                pass

    def run_with_profiling(self, executable_path: str, args: Optional[List[str]] = None) -> Dict:
        """Run executable with rocprof profiling.

        Args:
            executable_path: Path to the compiled binary to profile.
            args: Optional extra command-line arguments for the binary.

        Returns:
            Parsed metrics dict with "success" and "execution_time_ms";
            failures carry an "error" string instead of metrics.
        """
        if not self.rocm_available:
            # Caller should use get_mock_profiling_data(kernel_name, iteration) directly.
            return {"success": False, "error": "ROCm not available; use get_mock_profiling_data(kernel_name, iteration) instead", "execution_time_ms": 0}
        try:
            if args is None:
                args = []
            # Run with rocprof stats timing
            cmd = [self.rocprof_path, '--stats', '--', executable_path] + args
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=120, check=False)
            if result.returncode != 0:
                # Surface whichever stream actually carries the detail.
                detail = result.stderr.strip() or result.stdout.strip(
                ) or "rocprof exited with a non-zero status"
                return {
                    "success": False,
                    "error": f"Profiling failed: {detail}",
                    "execution_time_ms": 0,
                }
            # Parse rocprof output
            return self._parse_rocprof_output(result.stdout, result.stderr)
        except subprocess.TimeoutExpired:
            return {"success": False, "error": "Profiling timed out", "execution_time_ms": 0}
        except (OSError, subprocess.SubprocessError) as e:
            return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}

    def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
        """Parse rocprof --stats CSV output (Name,Calls,TotalDurationNs,AverageNs,Percentage).

        Uses the first non-helper kernel row with a positive AverageNs.
        Malformed rows are skipped so one bad cell does not abort the
        whole parse.
        """
        try:
            metrics: Dict = {}
            reader = csv.DictReader(io.StringIO(stdout))
            for row in reader:
                name = row.get("Name", "")
                # Skip ROCm runtime helper kernels
                if "__amd_rocclr" in name:
                    continue
                avg_ns_str = (row.get("AverageNs") or "").strip()
                if not avg_ns_str:
                    continue
                try:
                    avg_ns = float(avg_ns_str)
                except ValueError:
                    # Tolerate a malformed row; later rows may still parse.
                    continue
                if avg_ns > 0:
                    metrics["execution_time_ms"] = round(
                        avg_ns / 1_000_000, 6)
                    # rocprof --stats provides timing only; the remaining
                    # fields are placeholders expected by the caller's schema.
                    metrics["memory_bandwidth_gbps"] = 0.0
                    metrics["gpu_utilization_percent"] = 0.0
                    metrics["sq_waves"] = 0
                    break
            if not metrics:
                return {
                    "success": False,
                    "error": "rocprof output contained no parseable kernel rows",
                    "execution_time_ms": 0,
                }
            metrics["success"] = True
            return metrics
        except Exception as e:
            # Defensive boundary: never let a parser bug crash the profiling
            # pipeline; report the failure instead.
            return {
                "success": False,
                "error": f"Failed to parse rocprof output: {str(e)}",
                "execution_time_ms": 0,
            }

    def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
        """Public accessor for deterministic demo profiling data used by testing layer."""
        return self._get_demo_profiling_data(kernel_name, iteration)

    def _get_demo_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
        """
        Return deterministic per-kernel demo profiling data.

        Replaces random.uniform() with representative MI300X values keyed by
        kernel name and iteration number. Every entry is tagged with
        data_source so the caller and the UI can show an honest provenance
        badge instead of fabricated numbers.
        """
        # Local import: demo_artifacts is project-local and only needed here.
        from .demo_artifacts import get_demo_data
        data = get_demo_data(kernel_name, iteration)
        data['success'] = True
        return data

    def get_hardware_info(self) -> Dict:
        """Get AMD GPU hardware information.

        Returns parsed rocminfo data when ROCm is available and rocminfo
        succeeds; otherwise MI300X defaults (name tagged "(Mock)" when
        ROCm is disabled entirely).
        """
        if not self.rocm_available:
            # Keep the mock payload consistent with _get_mock_hardware_info
            # (including cache sizes); only the name is tagged.
            info = self._get_mock_hardware_info()
            info['gpu_name'] = 'AMD MI300X (Mock)'
            return info
        try:
            # Try to get real GPU info using rocminfo or similar
            result = subprocess.run(
                ['rocminfo'], capture_output=True, text=True, timeout=10, check=False)
            if result.returncode == 0:
                return self._parse_rocminfo(result.stdout)
            return self._get_mock_hardware_info()
        except (OSError, subprocess.SubprocessError):
            return self._get_mock_hardware_info()

    def _parse_rocminfo(self, output: str) -> Dict:
        """Parse rocminfo output to extract hardware info.

        NOTE(review): rocminfo lists every HSA agent (CPUs before GPUs);
        these first-match regexes assume the matched fields belong to the
        GPU agent — confirm against real rocminfo output.
        """
        info = self._get_mock_hardware_info()  # safe MI300X defaults
        name_match = re.search(r'^\s*Name:\s+(.+)$', output, re.MULTILINE)
        if name_match:
            info['gpu_name'] = name_match.group(1).strip()
        cu_match = re.search(r'^\s*Compute Unit:\s+(\d+)',
                             output, re.MULTILINE)
        if cu_match:
            info['compute_units'] = int(cu_match.group(1))
        wf_match = re.search(
            r'^\s*Wavefront Size:\s+(\d+)', output, re.MULTILINE)
        if wf_match:
            info['wavefront_size'] = int(wf_match.group(1))
        return info

    def _get_mock_hardware_info(self) -> Dict:
        """Mock hardware info for MI300X"""
        return {
            'gpu_name': 'AMD MI300X',
            'compute_units': 120,
            'memory_size_gb': 192,
            'memory_bandwidth_tb_s': 5.3,
            'wavefront_size': 64,
            'l2_cache_size_kb': 16384,
            'l1_cache_size_kb': 128
        }