tazwarrrr committed on
Commit
0b5416e
·
1 Parent(s): 7982eca

fix: priority 1-4 debug pass — retry loop, SSE timeout, rocprof CSV parser, silent failures

Browse files
backend/agents/analyzer.py CHANGED
@@ -1,4 +1,5 @@
1
  # pylint: disable=broad-exception-caught
 
2
 
3
  from ..models import AnalyzerResult, WorkloadType
4
  from ..tools.llm_client import LLMClient
@@ -105,6 +106,8 @@ def run(cuda_code: str) -> AnalyzerResult:
105
  )
106
  data = safe_json_loads(raw)
107
  except Exception:
 
 
108
  # Fallback to static-scan-informed defaults on LLM/parse failure
109
  data = {
110
  "kernels_found": ["unknown_kernel"],
 
1
  # pylint: disable=broad-exception-caught
2
+ import logging
3
 
4
  from ..models import AnalyzerResult, WorkloadType
5
  from ..tools.llm_client import LLMClient
 
106
  )
107
  data = safe_json_loads(raw)
108
  except Exception:
109
+ logging.exception(
110
+ "Analyzer LLM call failed; falling back to static-scan defaults")
111
  # Fallback to static-scan-informed defaults on LLM/parse failure
112
  data = {
113
  "kernels_found": ["unknown_kernel"],
backend/agents/coordinator.py CHANGED
@@ -71,6 +71,9 @@ def simplify_explanation(report: FinalReport) -> str:
71
  return simple_text
72
 
73
 
 
 
 
74
  async def run_pipeline(
75
  cuda_code: str,
76
  kernel_name: str = "custom",
 
71
  return simple_text
72
 
73
 
74
+ # NOTE: run_pipeline below is NOT used by the active LangGraph pipeline.
75
+ # The active pipeline is backend/graph/pipeline.py (build_pipeline / pipeline).
76
+ # This function is kept for reference but is dead code.
77
  async def run_pipeline(
78
  cuda_code: str,
79
  kernel_name: str = "custom",
backend/agents/optimizer.py CHANGED
@@ -1,4 +1,5 @@
1
  # pylint: disable=broad-exception-caught
 
2
 
3
  from ..models import OptimizerResult, AnalyzerResult, WorkloadType
4
  from ..tools.llm_client import LLMClient
@@ -77,6 +78,8 @@ Try a DIFFERENT strategy. If you applied shared memory tiling, try memory coales
77
  )
78
  data = safe_json_loads(raw)
79
  except Exception:
 
 
80
  # Fallback to original hip_code if LLM fails
81
  data = {
82
  "optimized_code": hip_code,
 
1
  # pylint: disable=broad-exception-caught
2
+ import logging
3
 
4
  from ..models import OptimizerResult, AnalyzerResult, WorkloadType
5
  from ..tools.llm_client import LLMClient
 
78
  )
79
  data = safe_json_loads(raw)
80
  except Exception:
81
+ logging.exception(
82
+ "Optimizer LLM call failed; returning unmodified hip_code")
83
  # Fallback to original hip_code if LLM fails
84
  data = {
85
  "optimized_code": hip_code,
backend/graph/pipeline.py CHANGED
@@ -446,7 +446,8 @@ def should_retry_decision(state: MigrationState) -> Literal["retry", "done"]:
446
  return "done"
447
  if not getattr(tester_result, "success", True):
448
  return "done" # hard compile/run failure — let coordinator report it
449
- speedup = float(getattr(tester_result, "speedup", 1.0) or 1.0)
 
450
  iteration = state.get("iteration", 0)
451
  max_iter = state.get("max_iterations", 3)
452
  if speedup < 0.95 and iteration < max_iter:
 
446
  return "done"
447
  if not getattr(tester_result, "success", True):
448
  return "done" # hard compile/run failure — let coordinator report it
449
+ raw = getattr(tester_result, "speedup", None)
450
+ speedup = float(raw) if raw is not None else 1.0
451
  iteration = state.get("iteration", 0)
452
  max_iter = state.get("max_iterations", 3)
453
  if speedup < 0.95 and iteration < max_iter:
backend/main.py CHANGED
@@ -145,7 +145,11 @@ async def port_cuda_code(req: PortRequest):
145
  task = asyncio.create_task(_run_graph())
146
  try:
147
  while True:
148
- event = await queue.get()
 
 
 
 
149
  if event is None:
150
  yield "data: [DONE]\n\n"
151
  break
@@ -423,7 +427,8 @@ async def list_demo_kernels():
423
  # Serve compiled frontend when available; fall back to the source folder for dev.
424
  frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
425
  frontend_dist = os.path.join(frontend_root, "dist")
426
- frontend_path = frontend_dist if os.path.exists(frontend_dist) else frontend_root
 
427
  if os.path.exists(frontend_path):
428
  app.mount("/", StaticFiles(directory=frontend_path,
429
  html=True), name="frontend")
 
145
  task = asyncio.create_task(_run_graph())
146
  try:
147
  while True:
148
+ try:
149
+ event = await asyncio.wait_for(queue.get(), timeout=120.0)
150
+ except asyncio.TimeoutError:
151
+ yield "data: [DONE]\n\n"
152
+ break
153
  if event is None:
154
  yield "data: [DONE]\n\n"
155
  break
 
427
  # Serve compiled frontend when available; fall back to the source folder for dev.
428
  frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
429
  frontend_dist = os.path.join(frontend_root, "dist")
430
+ frontend_path = frontend_dist if os.path.exists(
431
+ frontend_dist) else frontend_root
432
  if os.path.exists(frontend_path):
433
  app.mount("/", StaticFiles(directory=frontend_path,
434
  html=True), name="frontend")
backend/tools/demo_artifacts.py CHANGED
@@ -20,9 +20,6 @@ from typing import Dict
20
  # - Iteration 1: optimizer applies first strategy
21
  # - Iteration 2 (where shown): fallback strategy after profiler-detected regression
22
  # - All times in milliseconds, bandwidth in GB/s
23
- #
24
- # These are representative of the kernel class behaviour, not exact measurements.
25
- # Real numbers require ROCM_AVAILABLE=true on actual MI300X hardware.
26
  # ---------------------------------------------------------------------------
27
 
28
  KERNEL_DEMO_DATA: Dict[str, Dict] = {
 
20
  # - Iteration 1: optimizer applies first strategy
21
  # - Iteration 2 (where shown): fallback strategy after profiler-detected regression
22
  # - All times in milliseconds, bandwidth in GB/s
 
 
 
23
  # ---------------------------------------------------------------------------
24
 
25
  KERNEL_DEMO_DATA: Dict[str, Dict] = {
backend/tools/rocprof_wrapper.py CHANGED
@@ -59,21 +59,21 @@ class RocprofWrapper:
59
  def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
60
  """Run executable with rocprof profiling"""
61
  if not self.rocm_available:
62
- # Return mock profiling data
63
- return self.get_mock_profiling_data()
64
 
65
  try:
66
  if args is None:
67
  args = []
68
 
69
- # Run with rocprof
70
- cmd = [self.rocprof_path, '-i', 'default', '--'] + \
71
- [executable_path] + args
72
  result = subprocess.run(
73
  cmd, capture_output=True, text=True, timeout=120, check=False)
74
 
75
  if result.returncode != 0:
76
- detail = result.stderr.strip() or result.stdout.strip() or "rocprof exited with a non-zero status"
 
77
  return {
78
  "success": False,
79
  "error": f"Profiling failed: {detail}",
@@ -92,51 +92,43 @@ class RocprofWrapper:
92
  return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
93
 
94
  def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
95
- """Parse rocprof output to extract metrics"""
 
 
96
  try:
97
- # Look for key metrics in rocprof output
98
- metrics = {}
99
-
100
- # Parse execution time
101
- time_match = re.search(
102
- r'Kernel execution time:\s+(\d+\.\d+)\s*ms', stdout)
103
- if time_match:
104
- metrics['execution_time_ms'] = float(time_match.group(1))
105
-
106
- # Parse memory bandwidth
107
- bandwidth_match = re.search(
108
- r'Memory bandwidth:\s+(\d+\.\d+)\s*GB/s', stdout)
109
- if bandwidth_match:
110
- metrics['memory_bandwidth_gbps'] = float(
111
- bandwidth_match.group(1))
112
-
113
- # Parse GPU utilization
114
- util_match = re.search(r'GPU utilization:\s+(\d+\.\d+)%', stdout)
115
- if util_match:
116
- metrics['gpu_utilization_percent'] = float(util_match.group(1))
117
-
118
- # Parse wavefront count
119
- wave_match = re.search(r'SQ_WAVES:\s+(\d+)', stdout)
120
- if wave_match:
121
- metrics['sq_waves'] = int(wave_match.group(1))
122
-
123
- # If no metrics found, return basic execution info
124
  if not metrics:
125
- metrics = {
126
- 'execution_time_ms': 100.0, # Default mock value
127
- 'memory_bandwidth_gbps': 50.0,
128
- 'gpu_utilization_percent': 75.0,
129
- 'sq_waves': 1024
130
  }
131
 
132
- metrics['success'] = True
133
  return metrics
134
 
135
- except (TypeError, ValueError) as e:
136
  return {
137
- 'success': False,
138
- 'error': f'Failed to parse rocprof output: {str(e)}',
139
- 'execution_time_ms': 0
140
  }
141
 
142
  def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
@@ -181,11 +173,21 @@ class RocprofWrapper:
181
  except (OSError, subprocess.SubprocessError):
182
  return self._get_mock_hardware_info()
183
 
184
- def _parse_rocminfo(self, _output: str) -> Dict:
185
- """Parse rocminfo output"""
186
- # This would parse real rocminfo output
187
- # For now, return mock data
188
- return self._get_mock_hardware_info()
 
 
 
 
 
 
 
 
 
 
189
 
190
  def _get_mock_hardware_info(self) -> Dict:
191
  """Mock hardware info for MI300X"""
 
59
  def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
60
  """Run executable with rocprof profiling"""
61
  if not self.rocm_available:
62
+ # Caller should use get_mock_profiling_data(kernel_name, iteration) directly.
63
+ return {"success": False, "error": "ROCm not available; use get_mock_profiling_data(kernel_name, iteration) instead", "execution_time_ms": 0}
64
 
65
  try:
66
  if args is None:
67
  args = []
68
 
69
+ # Run with rocprof stats timing
70
+ cmd = [self.rocprof_path, '--stats', '--', executable_path] + args
 
71
  result = subprocess.run(
72
  cmd, capture_output=True, text=True, timeout=120, check=False)
73
 
74
  if result.returncode != 0:
75
+ detail = result.stderr.strip() or result.stdout.strip(
76
+ ) or "rocprof exited with a non-zero status"
77
  return {
78
  "success": False,
79
  "error": f"Profiling failed: {detail}",
 
92
  return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
93
 
94
  def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
95
+ """Parse rocprof --stats CSV output (Name,Calls,TotalDurationNs,AverageNs,Percentage)."""
96
+ import csv
97
+ import io
98
  try:
99
+ metrics: Dict = {}
100
+ reader = csv.DictReader(io.StringIO(stdout))
101
+ for row in reader:
102
+ name = row.get("Name", "")
103
+ # Skip ROCm runtime helper kernels
104
+ if "__amd_rocclr" in name:
105
+ continue
106
+ avg_ns_str = row.get("AverageNs", "") or ""
107
+ if avg_ns_str.strip():
108
+ avg_ns = float(avg_ns_str)
109
+ if avg_ns > 0:
110
+ metrics["execution_time_ms"] = round(
111
+ avg_ns / 1_000_000, 6)
112
+ metrics["memory_bandwidth_gbps"] = 0.0
113
+ metrics["gpu_utilization_percent"] = 0.0
114
+ metrics["sq_waves"] = 0
115
+ break
116
+
 
 
 
 
 
 
 
 
 
117
  if not metrics:
118
+ return {
119
+ "success": False,
120
+ "error": "rocprof output contained no parseable kernel rows",
121
+ "execution_time_ms": 0,
 
122
  }
123
 
124
+ metrics["success"] = True
125
  return metrics
126
 
127
+ except Exception as e:
128
  return {
129
+ "success": False,
130
+ "error": f"Failed to parse rocprof output: {str(e)}",
131
+ "execution_time_ms": 0,
132
  }
133
 
134
  def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
 
173
  except (OSError, subprocess.SubprocessError):
174
  return self._get_mock_hardware_info()
175
 
176
+ def _parse_rocminfo(self, output: str) -> Dict:
177
+ """Parse rocminfo output to extract hardware info."""
178
+ info = self._get_mock_hardware_info() # safe MI300X defaults
179
+ name_match = re.search(r'^\s*Name:\s+(.+)$', output, re.MULTILINE)
180
+ if name_match:
181
+ info['gpu_name'] = name_match.group(1).strip()
182
+ cu_match = re.search(r'^\s*Compute Unit:\s+(\d+)',
183
+ output, re.MULTILINE)
184
+ if cu_match:
185
+ info['compute_units'] = int(cu_match.group(1))
186
+ wf_match = re.search(
187
+ r'^\s*Wavefront Size:\s+(\d+)', output, re.MULTILINE)
188
+ if wf_match:
189
+ info['wavefront_size'] = int(wf_match.group(1))
190
+ return info
191
 
192
  def _get_mock_hardware_info(self) -> Dict:
193
  """Mock hardware info for MI300X"""