fix: priority 1-4 debug pass — retry loop, SSE timeout, rocprof CSV parser, silent failures
Browse files
- backend/agents/analyzer.py +3 -0
- backend/agents/coordinator.py +3 -0
- backend/agents/optimizer.py +3 -0
- backend/graph/pipeline.py +2 -1
- backend/main.py +7 -2
- backend/tools/demo_artifacts.py +0 -3
- backend/tools/rocprof_wrapper.py +51 -49
backend/agents/analyzer.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# pylint: disable=broad-exception-caught
|
|
|
|
| 2 |
|
| 3 |
from ..models import AnalyzerResult, WorkloadType
|
| 4 |
from ..tools.llm_client import LLMClient
|
|
@@ -105,6 +106,8 @@ def run(cuda_code: str) -> AnalyzerResult:
|
|
| 105 |
)
|
| 106 |
data = safe_json_loads(raw)
|
| 107 |
except Exception:
|
|
|
|
|
|
|
| 108 |
# Fallback to static-scan-informed defaults on LLM/parse failure
|
| 109 |
data = {
|
| 110 |
"kernels_found": ["unknown_kernel"],
|
|
|
|
| 1 |
# pylint: disable=broad-exception-caught
|
| 2 |
+
import logging
|
| 3 |
|
| 4 |
from ..models import AnalyzerResult, WorkloadType
|
| 5 |
from ..tools.llm_client import LLMClient
|
|
|
|
| 106 |
)
|
| 107 |
data = safe_json_loads(raw)
|
| 108 |
except Exception:
|
| 109 |
+
logging.exception(
|
| 110 |
+
"Analyzer LLM call failed; falling back to static-scan defaults")
|
| 111 |
# Fallback to static-scan-informed defaults on LLM/parse failure
|
| 112 |
data = {
|
| 113 |
"kernels_found": ["unknown_kernel"],
|
backend/agents/coordinator.py
CHANGED
|
@@ -71,6 +71,9 @@ def simplify_explanation(report: FinalReport) -> str:
|
|
| 71 |
return simple_text
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
| 74 |
async def run_pipeline(
|
| 75 |
cuda_code: str,
|
| 76 |
kernel_name: str = "custom",
|
|
|
|
| 71 |
return simple_text
|
| 72 |
|
| 73 |
|
| 74 |
+
# NOTE: run_pipeline below is NOT used by the active LangGraph pipeline.
|
| 75 |
+
# The active pipeline is backend/graph/pipeline.py (build_pipeline / pipeline).
|
| 76 |
+
# This function is kept for reference but is dead code.
|
| 77 |
async def run_pipeline(
|
| 78 |
cuda_code: str,
|
| 79 |
kernel_name: str = "custom",
|
backend/agents/optimizer.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# pylint: disable=broad-exception-caught
|
|
|
|
| 2 |
|
| 3 |
from ..models import OptimizerResult, AnalyzerResult, WorkloadType
|
| 4 |
from ..tools.llm_client import LLMClient
|
|
@@ -77,6 +78,8 @@ Try a DIFFERENT strategy. If you applied shared memory tiling, try memory coales
|
|
| 77 |
)
|
| 78 |
data = safe_json_loads(raw)
|
| 79 |
except Exception:
|
|
|
|
|
|
|
| 80 |
# Fallback to original hip_code if LLM fails
|
| 81 |
data = {
|
| 82 |
"optimized_code": hip_code,
|
|
|
|
| 1 |
# pylint: disable=broad-exception-caught
|
| 2 |
+
import logging
|
| 3 |
|
| 4 |
from ..models import OptimizerResult, AnalyzerResult, WorkloadType
|
| 5 |
from ..tools.llm_client import LLMClient
|
|
|
|
| 78 |
)
|
| 79 |
data = safe_json_loads(raw)
|
| 80 |
except Exception:
|
| 81 |
+
logging.exception(
|
| 82 |
+
"Optimizer LLM call failed; returning unmodified hip_code")
|
| 83 |
# Fallback to original hip_code if LLM fails
|
| 84 |
data = {
|
| 85 |
"optimized_code": hip_code,
|
backend/graph/pipeline.py
CHANGED
|
@@ -446,7 +446,8 @@ def should_retry_decision(state: MigrationState) -> Literal["retry", "done"]:
|
|
| 446 |
return "done"
|
| 447 |
if not getattr(tester_result, "success", True):
|
| 448 |
return "done" # hard compile/run failure — let coordinator report it
|
| 449 |
-
|
|
|
|
| 450 |
iteration = state.get("iteration", 0)
|
| 451 |
max_iter = state.get("max_iterations", 3)
|
| 452 |
if speedup < 0.95 and iteration < max_iter:
|
|
|
|
| 446 |
return "done"
|
| 447 |
if not getattr(tester_result, "success", True):
|
| 448 |
return "done" # hard compile/run failure — let coordinator report it
|
| 449 |
+
raw = getattr(tester_result, "speedup", None)
|
| 450 |
+
speedup = float(raw) if raw is not None else 1.0
|
| 451 |
iteration = state.get("iteration", 0)
|
| 452 |
max_iter = state.get("max_iterations", 3)
|
| 453 |
if speedup < 0.95 and iteration < max_iter:
|
backend/main.py
CHANGED
|
@@ -145,7 +145,11 @@ async def port_cuda_code(req: PortRequest):
|
|
| 145 |
task = asyncio.create_task(_run_graph())
|
| 146 |
try:
|
| 147 |
while True:
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
if event is None:
|
| 150 |
yield "data: [DONE]\n\n"
|
| 151 |
break
|
|
@@ -423,7 +427,8 @@ async def list_demo_kernels():
|
|
| 423 |
# Serve compiled frontend when available; fall back to the source folder for dev.
|
| 424 |
frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
|
| 425 |
frontend_dist = os.path.join(frontend_root, "dist")
|
| 426 |
-
frontend_path = frontend_dist if os.path.exists(
|
|
|
|
| 427 |
if os.path.exists(frontend_path):
|
| 428 |
app.mount("/", StaticFiles(directory=frontend_path,
|
| 429 |
html=True), name="frontend")
|
|
|
|
| 145 |
task = asyncio.create_task(_run_graph())
|
| 146 |
try:
|
| 147 |
while True:
|
| 148 |
+
try:
|
| 149 |
+
event = await asyncio.wait_for(queue.get(), timeout=120.0)
|
| 150 |
+
except asyncio.TimeoutError:
|
| 151 |
+
yield "data: [DONE]\n\n"
|
| 152 |
+
break
|
| 153 |
if event is None:
|
| 154 |
yield "data: [DONE]\n\n"
|
| 155 |
break
|
|
|
|
| 427 |
# Serve compiled frontend when available; fall back to the source folder for dev.
|
| 428 |
frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
|
| 429 |
frontend_dist = os.path.join(frontend_root, "dist")
|
| 430 |
+
frontend_path = frontend_dist if os.path.exists(
|
| 431 |
+
frontend_dist) else frontend_root
|
| 432 |
if os.path.exists(frontend_path):
|
| 433 |
app.mount("/", StaticFiles(directory=frontend_path,
|
| 434 |
html=True), name="frontend")
|
backend/tools/demo_artifacts.py
CHANGED
|
@@ -20,9 +20,6 @@ from typing import Dict
|
|
| 20 |
# - Iteration 1: optimizer applies first strategy
|
| 21 |
# - Iteration 2 (where shown): fallback strategy after profiler-detected regression
|
| 22 |
# - All times in milliseconds, bandwidth in GB/s
|
| 23 |
-
#
|
| 24 |
-
# These are representative of the kernel class behaviour, not exact measurements.
|
| 25 |
-
# Real numbers require ROCM_AVAILABLE=true on actual MI300X hardware.
|
| 26 |
# ---------------------------------------------------------------------------
|
| 27 |
|
| 28 |
KERNEL_DEMO_DATA: Dict[str, Dict] = {
|
|
|
|
| 20 |
# - Iteration 1: optimizer applies first strategy
|
| 21 |
# - Iteration 2 (where shown): fallback strategy after profiler-detected regression
|
| 22 |
# - All times in milliseconds, bandwidth in GB/s
|
|
|
|
|
|
|
|
|
|
| 23 |
# ---------------------------------------------------------------------------
|
| 24 |
|
| 25 |
KERNEL_DEMO_DATA: Dict[str, Dict] = {
|
backend/tools/rocprof_wrapper.py
CHANGED
|
@@ -59,21 +59,21 @@ class RocprofWrapper:
|
|
| 59 |
def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
|
| 60 |
"""Run executable with rocprof profiling"""
|
| 61 |
if not self.rocm_available:
|
| 62 |
-
#
|
| 63 |
-
return
|
| 64 |
|
| 65 |
try:
|
| 66 |
if args is None:
|
| 67 |
args = []
|
| 68 |
|
| 69 |
-
# Run with rocprof
|
| 70 |
-
cmd = [self.rocprof_path, '-
|
| 71 |
-
[executable_path] + args
|
| 72 |
result = subprocess.run(
|
| 73 |
cmd, capture_output=True, text=True, timeout=120, check=False)
|
| 74 |
|
| 75 |
if result.returncode != 0:
|
| 76 |
-
detail = result.stderr.strip() or result.stdout.strip(
|
|
|
|
| 77 |
return {
|
| 78 |
"success": False,
|
| 79 |
"error": f"Profiling failed: {detail}",
|
|
@@ -92,51 +92,43 @@ class RocprofWrapper:
|
|
| 92 |
return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
|
| 93 |
|
| 94 |
def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
|
| 95 |
-
"""Parse rocprof
|
|
|
|
|
|
|
| 96 |
try:
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
if util_match:
|
| 116 |
-
metrics['gpu_utilization_percent'] = float(util_match.group(1))
|
| 117 |
-
|
| 118 |
-
# Parse wavefront count
|
| 119 |
-
wave_match = re.search(r'SQ_WAVES:\s+(\d+)', stdout)
|
| 120 |
-
if wave_match:
|
| 121 |
-
metrics['sq_waves'] = int(wave_match.group(1))
|
| 122 |
-
|
| 123 |
-
# If no metrics found, return basic execution info
|
| 124 |
if not metrics:
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
'sq_waves': 1024
|
| 130 |
}
|
| 131 |
|
| 132 |
-
metrics[
|
| 133 |
return metrics
|
| 134 |
|
| 135 |
-
except
|
| 136 |
return {
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
}
|
| 141 |
|
| 142 |
def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
|
|
@@ -181,11 +173,21 @@ class RocprofWrapper:
|
|
| 181 |
except (OSError, subprocess.SubprocessError):
|
| 182 |
return self._get_mock_hardware_info()
|
| 183 |
|
| 184 |
-
def _parse_rocminfo(self,
|
| 185 |
-
"""Parse rocminfo output"""
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
def _get_mock_hardware_info(self) -> Dict:
|
| 191 |
"""Mock hardware info for MI300X"""
|
|
|
|
| 59 |
def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
|
| 60 |
"""Run executable with rocprof profiling"""
|
| 61 |
if not self.rocm_available:
|
| 62 |
+
# Caller should use get_mock_profiling_data(kernel_name, iteration) directly.
|
| 63 |
+
return {"success": False, "error": "ROCm not available; use get_mock_profiling_data(kernel_name, iteration) instead", "execution_time_ms": 0}
|
| 64 |
|
| 65 |
try:
|
| 66 |
if args is None:
|
| 67 |
args = []
|
| 68 |
|
| 69 |
+
# Run with rocprof stats timing
|
| 70 |
+
cmd = [self.rocprof_path, '--stats', '--', executable_path] + args
|
|
|
|
| 71 |
result = subprocess.run(
|
| 72 |
cmd, capture_output=True, text=True, timeout=120, check=False)
|
| 73 |
|
| 74 |
if result.returncode != 0:
|
| 75 |
+
detail = result.stderr.strip() or result.stdout.strip(
|
| 76 |
+
) or "rocprof exited with a non-zero status"
|
| 77 |
return {
|
| 78 |
"success": False,
|
| 79 |
"error": f"Profiling failed: {detail}",
|
|
|
|
| 92 |
return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
|
| 93 |
|
| 94 |
def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
|
| 95 |
+
"""Parse rocprof --stats CSV output (Name,Calls,TotalDurationNs,AverageNs,Percentage)."""
|
| 96 |
+
import csv
|
| 97 |
+
import io
|
| 98 |
try:
|
| 99 |
+
metrics: Dict = {}
|
| 100 |
+
reader = csv.DictReader(io.StringIO(stdout))
|
| 101 |
+
for row in reader:
|
| 102 |
+
name = row.get("Name", "")
|
| 103 |
+
# Skip ROCm runtime helper kernels
|
| 104 |
+
if "__amd_rocclr" in name:
|
| 105 |
+
continue
|
| 106 |
+
avg_ns_str = row.get("AverageNs", "") or ""
|
| 107 |
+
if avg_ns_str.strip():
|
| 108 |
+
avg_ns = float(avg_ns_str)
|
| 109 |
+
if avg_ns > 0:
|
| 110 |
+
metrics["execution_time_ms"] = round(
|
| 111 |
+
avg_ns / 1_000_000, 6)
|
| 112 |
+
metrics["memory_bandwidth_gbps"] = 0.0
|
| 113 |
+
metrics["gpu_utilization_percent"] = 0.0
|
| 114 |
+
metrics["sq_waves"] = 0
|
| 115 |
+
break
|
| 116 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
if not metrics:
|
| 118 |
+
return {
|
| 119 |
+
"success": False,
|
| 120 |
+
"error": "rocprof output contained no parseable kernel rows",
|
| 121 |
+
"execution_time_ms": 0,
|
|
|
|
| 122 |
}
|
| 123 |
|
| 124 |
+
metrics["success"] = True
|
| 125 |
return metrics
|
| 126 |
|
| 127 |
+
except Exception as e:
|
| 128 |
return {
|
| 129 |
+
"success": False,
|
| 130 |
+
"error": f"Failed to parse rocprof output: {str(e)}",
|
| 131 |
+
"execution_time_ms": 0,
|
| 132 |
}
|
| 133 |
|
| 134 |
def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
|
|
|
|
| 173 |
except (OSError, subprocess.SubprocessError):
|
| 174 |
return self._get_mock_hardware_info()
|
| 175 |
|
| 176 |
+
def _parse_rocminfo(self, output: str) -> Dict:
|
| 177 |
+
"""Parse rocminfo output to extract hardware info."""
|
| 178 |
+
info = self._get_mock_hardware_info() # safe MI300X defaults
|
| 179 |
+
name_match = re.search(r'^\s*Name:\s+(.+)$', output, re.MULTILINE)
|
| 180 |
+
if name_match:
|
| 181 |
+
info['gpu_name'] = name_match.group(1).strip()
|
| 182 |
+
cu_match = re.search(r'^\s*Compute Unit:\s+(\d+)',
|
| 183 |
+
output, re.MULTILINE)
|
| 184 |
+
if cu_match:
|
| 185 |
+
info['compute_units'] = int(cu_match.group(1))
|
| 186 |
+
wf_match = re.search(
|
| 187 |
+
r'^\s*Wavefront Size:\s+(\d+)', output, re.MULTILINE)
|
| 188 |
+
if wf_match:
|
| 189 |
+
info['wavefront_size'] = int(wf_match.group(1))
|
| 190 |
+
return info
|
| 191 |
|
| 192 |
def _get_mock_hardware_info(self) -> Dict:
|
| 193 |
"""Mock hardware info for MI300X"""
|