# pylint: disable=broad-exception-caught
"""Optimizer step: ask the LLM to apply a fixed list of AMD MI300X optimizations to HIP code."""
import logging
from typing import Optional, Tuple

from ..models import OptimizerResult, AnalyzerResult, WorkloadType
from ..tools.llm_client import LLMClient
from ..tools.json_utils import safe_json_loads

llm_client = LLMClient()


def chat_complete(messages: list, temperature: float = 0.7, max_tokens: int = 4000) -> str:
    """Wrapper for LLM client chat completion.

    Forwards ``messages`` plus the sampling settings to the module-level
    ``llm_client`` and returns whatever ``chat_completion`` returns.
    """
    return llm_client.chat_completion(messages, temperature=temperature, max_tokens=max_tokens)


# Whitelist of optimizations the model may propose; interpolated into SYSTEM_PROMPT.
ALLOWED_OPTIMIZATIONS = """
You may ONLY suggest these specific, well-known AMD MI300X optimizations:

1. Shared memory tiling: Replace naive global memory access with 32x32 shared memory tiles (__shared__)
2. Block size adjustment: Change thread block size to 256 for MI300X wavefront alignment (multiple of 64)
3. Memory coalescing: Fix non-coalesced global memory access patterns (ensure stride-1 access)
4. Kernel fusion: Identify two adjacent kernels that can be merged to reduce memory round-trips
5. LDS bank conflict avoidance: Add padding to shared memory arrays to avoid 32-bank conflicts
6. Remove GPU sharding: If code splits work across GPUs due to 80GB limit, remove -- MI300X has 192GB
7. Loop unrolling: Add #pragma unroll for small fixed-size loops

DO NOT invent optimizations. Stick strictly to the list above.
DO NOT suggest anything you are not 100% certain will improve AMD performance.
If the code is already well-optimized, say so -- fewer changes is better than wrong ones.
"""

# System message sent with every request. Doubled braces are literal braces in
# the f-string so the model sees a plain JSON template.
SYSTEM_PROMPT = f"""You are an AMD MI300X performance engineer.
You receive HIP code and apply AMD-specific optimizations.

{ALLOWED_OPTIMIZATIONS}

Return ONLY this JSON, no markdown:
{{
  "optimized_code": "the complete optimized HIP code",
  "changes": [
    {{
      "description": "Replaced global memory access with shared memory tile (32x32)",
      "impact": "Reduces global memory bandwidth pressure, better L2 cache utilization"
    }}
  ]
}}

Be conservative.
2-3 high-confidence changes beat 10 uncertain ones."""


def _build_user_prompt(
    hip_code: str,
    analyzer_result: AnalyzerResult,
    iteration: int,
    previous_feedback: Optional[str],
) -> str:
    """Assemble the user message: hardware context, optional retry feedback, then the code."""
    if analyzer_result.workload_type == WorkloadType.MEMORY_BOUND:
        priority_hint = "MEMORY-BOUND: prioritize memory coalescing and shared memory tiling"
    else:
        priority_hint = "COMPUTE-BOUND: prioritize arithmetic efficiency and register usage"

    context = f"""
Optimize this HIP code for AMD MI300X.

Hardware context:
- MI300X: 192GB HBM3, 5.3 TB/s bandwidth, wavefront size = 64
- Workload classification: {analyzer_result.workload_type.value}
- {priority_hint}
"""

    # Retry feedback is only injected on the second iteration, and only when
    # the caller actually supplied some.
    if iteration == 2 and previous_feedback:
        context += f"""
ITERATION 2 -- Previous optimization made performance WORSE.
Profiler feedback: {previous_feedback}
Try a DIFFERENT strategy. If you applied shared memory tiling, try memory coalescing instead.
"""

    context += f"\nHIP code to optimize:\n```\n{hip_code}\n```"
    return context


def _extract_result_fields(data: object, hip_code: str) -> Tuple[str, list]:
    """Validate the parsed LLM payload and return ``(optimized_code, changes)``.

    Falls back to ``(hip_code, [])`` when the payload is not a JSON object or
    carries no usable ``optimized_code``, so callers never receive a change
    list describing edits that were not applied.
    """
    # Parsed JSON is not guaranteed to be an object; it may be None, a list, etc.
    if not isinstance(data, dict):
        logging.warning(
            "Optimizer LLM response was not a JSON object; returning unmodified hip_code")
        return hip_code, []

    optimized_code = data.get("optimized_code")
    if not isinstance(optimized_code, str) or not optimized_code.strip():
        logging.warning(
            "Optimizer LLM response had no usable optimized_code; returning unmodified hip_code")
        return hip_code, []

    changes = data.get("changes")
    if not isinstance(changes, list):
        # e.g. "changes": null -- normalise to an empty list.
        changes = []
    return optimized_code, changes


def run(
    hip_code: str,
    analyzer_result: AnalyzerResult,
    iteration: int = 1,
    previous_feedback: Optional[str] = None,
) -> OptimizerResult:
    """Ask the LLM to optimize ``hip_code`` and wrap the answer in an ``OptimizerResult``.

    Args:
        hip_code: HIP source to optimize; also the fallback result.
        analyzer_result: Its ``workload_type`` selects the memory-bound or
            compute-bound hint in the prompt.
        iteration: Attempt number, echoed into the result. When it equals 2
            and ``previous_feedback`` is non-empty, the feedback is added to
            the prompt.
        previous_feedback: Profiler feedback from the previous attempt, if any.

    Returns:
        An ``OptimizerResult`` with the model's code and change list, or the
        unmodified ``hip_code`` with no changes if the LLM call, the JSON
        parsing, or the payload validation fails.
    """
    context = _build_user_prompt(hip_code, analyzer_result, iteration, previous_feedback)

    try:
        raw = chat_complete(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": context},
            ],
            temperature=0.1,
            max_tokens=4096,
        )
        data = safe_json_loads(raw)
    except Exception:
        logging.exception(
            "Optimizer LLM call failed; returning unmodified hip_code")
        # Fallback to original hip_code if LLM fails
        return OptimizerResult(optimized_code=hip_code, changes=[], iteration=iteration)

    optimized_code, changes = _extract_result_fields(data, hip_code)
    return OptimizerResult(
        optimized_code=optimized_code,
        changes=changes,
        iteration=iteration,
    )