Spaces:

lablab-ai-amd-developer-hackathon
/

ROCmPort-AI

Running

App Files Files Community

tazwarrrr committed 20 days ago

Commit

0b5416e

1 Parent(s): 7982eca

fix: priority 1-4 debug pass — retry loop, SSE timeout, rocprof CSV parser, silent failures

Browse files

Files changed (7) hide show

backend/agents/analyzer.py +3 -0
backend/agents/coordinator.py +3 -0
backend/agents/optimizer.py +3 -0
backend/graph/pipeline.py +2 -1
backend/main.py +7 -2
backend/tools/demo_artifacts.py +0 -3
backend/tools/rocprof_wrapper.py +51 -49

backend/agents/analyzer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # pylint: disable=broad-exception-caught
 from ..models import AnalyzerResult, WorkloadType
 from ..tools.llm_client import LLMClient
@@ -105,6 +106,8 @@ def run(cuda_code: str) -> AnalyzerResult:
         )
         data = safe_json_loads(raw)
     except Exception:
         # Fallback to static-scan-informed defaults on LLM/parse failure
         data = {
             "kernels_found": ["unknown_kernel"],

 # pylint: disable=broad-exception-caught
+import logging
 from ..models import AnalyzerResult, WorkloadType
 from ..tools.llm_client import LLMClient
         )
         data = safe_json_loads(raw)
     except Exception:
+        logging.exception(
+            "Analyzer LLM call failed; falling back to static-scan defaults")
         # Fallback to static-scan-informed defaults on LLM/parse failure
         data = {
             "kernels_found": ["unknown_kernel"],

backend/agents/coordinator.py CHANGED Viewed

@@ -71,6 +71,9 @@ def simplify_explanation(report: FinalReport) -> str:
     return simple_text
 async def run_pipeline(
     cuda_code: str,
     kernel_name: str = "custom",

     return simple_text
+# NOTE: run_pipeline below is NOT used by the active LangGraph pipeline.
+# The active pipeline is backend/graph/pipeline.py (build_pipeline / pipeline).
+# This function is kept for reference but is dead code.
 async def run_pipeline(
     cuda_code: str,
     kernel_name: str = "custom",

backend/agents/optimizer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # pylint: disable=broad-exception-caught
 from ..models import OptimizerResult, AnalyzerResult, WorkloadType
 from ..tools.llm_client import LLMClient
@@ -77,6 +78,8 @@ Try a DIFFERENT strategy. If you applied shared memory tiling, try memory coales
         )
         data = safe_json_loads(raw)
     except Exception:
         # Fallback to original hip_code if LLM fails
         data = {
             "optimized_code": hip_code,

 # pylint: disable=broad-exception-caught
+import logging
 from ..models import OptimizerResult, AnalyzerResult, WorkloadType
 from ..tools.llm_client import LLMClient
         )
         data = safe_json_loads(raw)
     except Exception:
+        logging.exception(
+            "Optimizer LLM call failed; returning unmodified hip_code")
         # Fallback to original hip_code if LLM fails
         data = {
             "optimized_code": hip_code,

backend/graph/pipeline.py CHANGED Viewed

@@ -446,7 +446,8 @@ def should_retry_decision(state: MigrationState) -> Literal["retry", "done"]:
         return "done"
     if not getattr(tester_result, "success", True):
         return "done"  # hard compile/run failure — let coordinator report it
-    speedup = float(getattr(tester_result, "speedup", 1.0) or 1.0)
     iteration = state.get("iteration", 0)
     max_iter = state.get("max_iterations", 3)
     if speedup < 0.95 and iteration < max_iter:

         return "done"
     if not getattr(tester_result, "success", True):
         return "done"  # hard compile/run failure — let coordinator report it
+    raw = getattr(tester_result, "speedup", None)
+    speedup = float(raw) if raw is not None else 1.0
     iteration = state.get("iteration", 0)
     max_iter = state.get("max_iterations", 3)
     if speedup < 0.95 and iteration < max_iter:

backend/main.py CHANGED Viewed

@@ -145,7 +145,11 @@ async def port_cuda_code(req: PortRequest):
         task = asyncio.create_task(_run_graph())
         try:
             while True:
-                event = await queue.get()
                 if event is None:
                     yield "data: [DONE]\n\n"
                     break
@@ -423,7 +427,8 @@ async def list_demo_kernels():
 # Serve compiled frontend when available; fall back to the source folder for dev.
 frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
 frontend_dist = os.path.join(frontend_root, "dist")
-frontend_path = frontend_dist if os.path.exists(frontend_dist) else frontend_root
 if os.path.exists(frontend_path):
     app.mount("/", StaticFiles(directory=frontend_path,
               html=True), name="frontend")

         task = asyncio.create_task(_run_graph())
         try:
             while True:
+                try:
+                    event = await asyncio.wait_for(queue.get(), timeout=120.0)
+                except asyncio.TimeoutError:
+                    yield "data: [DONE]\n\n"
+                    break
                 if event is None:
                     yield "data: [DONE]\n\n"
                     break
 # Serve compiled frontend when available; fall back to the source folder for dev.
 frontend_root = os.path.join(os.path.dirname(__file__), "..", "frontend")
 frontend_dist = os.path.join(frontend_root, "dist")
+frontend_path = frontend_dist if os.path.exists(
+    frontend_dist) else frontend_root
 if os.path.exists(frontend_path):
     app.mount("/", StaticFiles(directory=frontend_path,
               html=True), name="frontend")

backend/tools/demo_artifacts.py CHANGED Viewed

@@ -20,9 +20,6 @@ from typing import Dict
 #   - Iteration 1: optimizer applies first strategy
 #   - Iteration 2 (where shown): fallback strategy after profiler-detected regression
 #   - All times in milliseconds, bandwidth in GB/s
-#
-# These are representative of the kernel class behaviour, not exact measurements.
-# Real numbers require ROCM_AVAILABLE=true on actual MI300X hardware.
 # ---------------------------------------------------------------------------
 KERNEL_DEMO_DATA: Dict[str, Dict] = {

 #   - Iteration 1: optimizer applies first strategy
 #   - Iteration 2 (where shown): fallback strategy after profiler-detected regression
 #   - All times in milliseconds, bandwidth in GB/s
 # ---------------------------------------------------------------------------
 KERNEL_DEMO_DATA: Dict[str, Dict] = {

backend/tools/rocprof_wrapper.py CHANGED Viewed

@@ -59,21 +59,21 @@ class RocprofWrapper:
     def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
         """Run executable with rocprof profiling"""
         if not self.rocm_available:
-            # Return mock profiling data
-            return self.get_mock_profiling_data()
         try:
             if args is None:
                 args = []
-            # Run with rocprof
-            cmd = [self.rocprof_path, '-i', 'default', '--'] + \
-                [executable_path] + args
             result = subprocess.run(
                 cmd, capture_output=True, text=True, timeout=120, check=False)
             if result.returncode != 0:
-                detail = result.stderr.strip() or result.stdout.strip() or "rocprof exited with a non-zero status"
                 return {
                     "success": False,
                     "error": f"Profiling failed: {detail}",
@@ -92,51 +92,43 @@ class RocprofWrapper:
             return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
     def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
-        """Parse rocprof output to extract metrics"""
         try:
-            # Look for key metrics in rocprof output
-            metrics = {}
-            # Parse execution time
-            time_match = re.search(
-                r'Kernel execution time:\s+(\d+\.\d+)\s*ms', stdout)
-            if time_match:
-                metrics['execution_time_ms'] = float(time_match.group(1))
-            # Parse memory bandwidth
-            bandwidth_match = re.search(
-                r'Memory bandwidth:\s+(\d+\.\d+)\s*GB/s', stdout)
-            if bandwidth_match:
-                metrics['memory_bandwidth_gbps'] = float(
-                    bandwidth_match.group(1))
-            # Parse GPU utilization
-            util_match = re.search(r'GPU utilization:\s+(\d+\.\d+)%', stdout)
-            if util_match:
-                metrics['gpu_utilization_percent'] = float(util_match.group(1))
-            # Parse wavefront count
-            wave_match = re.search(r'SQ_WAVES:\s+(\d+)', stdout)
-            if wave_match:
-                metrics['sq_waves'] = int(wave_match.group(1))
-            # If no metrics found, return basic execution info
             if not metrics:
-                metrics = {
-                    'execution_time_ms': 100.0,  # Default mock value
-                    'memory_bandwidth_gbps': 50.0,
-                    'gpu_utilization_percent': 75.0,
-                    'sq_waves': 1024
                 }
-            metrics['success'] = True
             return metrics
-        except (TypeError, ValueError) as e:
             return {
-                'success': False,
-                'error': f'Failed to parse rocprof output: {str(e)}',
-                'execution_time_ms': 0
             }
     def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
@@ -181,11 +173,21 @@ class RocprofWrapper:
         except (OSError, subprocess.SubprocessError):
             return self._get_mock_hardware_info()
-    def _parse_rocminfo(self, _output: str) -> Dict:
-        """Parse rocminfo output"""
-        # This would parse real rocminfo output
-        # For now, return mock data
-        return self._get_mock_hardware_info()
     def _get_mock_hardware_info(self) -> Dict:
         """Mock hardware info for MI300X"""

     def run_with_profiling(self, executable_path: str, args: List[str] = None) -> Dict:
         """Run executable with rocprof profiling"""
         if not self.rocm_available:
+            # Caller should use get_mock_profiling_data(kernel_name, iteration) directly.
+            return {"success": False, "error": "ROCm not available; use get_mock_profiling_data(kernel_name, iteration) instead", "execution_time_ms": 0}
         try:
             if args is None:
                 args = []
+            # Run with rocprof stats timing
+            cmd = [self.rocprof_path, '--stats', '--', executable_path] + args
             result = subprocess.run(
                 cmd, capture_output=True, text=True, timeout=120, check=False)
             if result.returncode != 0:
+                detail = result.stderr.strip() or result.stdout.strip(
+                ) or "rocprof exited with a non-zero status"
                 return {
                     "success": False,
                     "error": f"Profiling failed: {detail}",
             return {"success": False, "error": f"Profiling error: {str(e)}", "execution_time_ms": 0}
     def _parse_rocprof_output(self, stdout: str, _stderr: str) -> Dict:
+        """Parse rocprof --stats CSV output (Name,Calls,TotalDurationNs,AverageNs,Percentage)."""
+        import csv
+        import io
         try:
+            metrics: Dict = {}
+            reader = csv.DictReader(io.StringIO(stdout))
+            for row in reader:
+                name = row.get("Name", "")
+                # Skip ROCm runtime helper kernels
+                if "__amd_rocclr" in name:
+                    continue
+                avg_ns_str = row.get("AverageNs", "") or ""
+                if avg_ns_str.strip():
+                    avg_ns = float(avg_ns_str)
+                    if avg_ns > 0:
+                        metrics["execution_time_ms"] = round(
+                            avg_ns / 1_000_000, 6)
+                        metrics["memory_bandwidth_gbps"] = 0.0
+                        metrics["gpu_utilization_percent"] = 0.0
+                        metrics["sq_waves"] = 0
+                        break
             if not metrics:
+                return {
+                    "success": False,
+                    "error": "rocprof output contained no parseable kernel rows",
+                    "execution_time_ms": 0,
                 }
+            metrics["success"] = True
             return metrics
+        except Exception as e:
             return {
+                "success": False,
+                "error": f"Failed to parse rocprof output: {str(e)}",
+                "execution_time_ms": 0,
             }
     def get_mock_profiling_data(self, kernel_name: str = "custom", iteration: int = 1) -> Dict:
         except (OSError, subprocess.SubprocessError):
             return self._get_mock_hardware_info()
+    def _parse_rocminfo(self, output: str) -> Dict:
+        """Parse rocminfo output to extract hardware info."""
+        info = self._get_mock_hardware_info()  # safe MI300X defaults
+        name_match = re.search(r'^\s*Name:\s+(.+)$', output, re.MULTILINE)
+        if name_match:
+            info['gpu_name'] = name_match.group(1).strip()
+        cu_match = re.search(r'^\s*Compute Unit:\s+(\d+)',
+                             output, re.MULTILINE)
+        if cu_match:
+            info['compute_units'] = int(cu_match.group(1))
+        wf_match = re.search(
+            r'^\s*Wavefront Size:\s+(\d+)', output, re.MULTILINE)
+        if wf_match:
+            info['wavefront_size'] = int(wf_match.group(1))
+        return info
     def _get_mock_hardware_info(self) -> Dict:
         """Mock hardware info for MI300X"""