# GPU Mode: Float16 Vector Addition
# Usage: uv run skydiscover-run initial_program.py evaluator.py -c config.yaml -s -i 50

# Run control.
max_iterations: 100
checkpoint_interval: 10
log_level: INFO

# LLM settings: the model pool plus the request parameters listed beside it.
llm:
  models:
    - name: "gpt-5"
      weight: 1.0
  api_base: https://api.openai.com/v1
  temperature: 0.7
  # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject requests that set both temperature and top_p
  max_tokens: 32000
  timeout: 600

# Prompt settings. The literal block scalar (|) keeps the line breaks of the
# system message exactly as written below.
prompt:
  system_message: |
    You are an expert Triton kernel engineer. Output ONLY Python code - no explanations.

    REQUIRED OUTPUT STRUCTURE:
    1. Imports: torch, triton, triton.language as tl
    2. @triton.jit kernel function(s)
    3. def custom_kernel(data) wrapper - REQUIRED entry point

    Task: Optimize float16 vector addition kernel.
    C = A + B

    Input: Tuple of (A, B) tensors of shape (N, N) and dtype torch.float16
    Output: Tensor of shape (N, N) and dtype torch.float16

    N can be: 256, 512, 1024, 2048, 4096, 8192

    Optimization tips:
    - Block size tuning (512, 1024, 2048, 4096)
    - Use @triton.autotune for automatic parameter tuning
    - Vectorized loads for memory operations
    - Grid configuration for occupancy
    - Memory coalescing for sequential access patterns

    MUST use @triton.jit decorator. MUST return float16 tensor.
    Output complete, working code in a single ```python``` block.

# Evaluator settings.
evaluator:
  timeout: 600
  max_retries: 3
  cascade_evaluation: true
  cascade_thresholds: [0.4, 0.3]

# NOTE(review): these three keys are assumed to be top-level settings rather
# than children of `evaluator` - confirm against the consumer's config schema.
diff_based_generation: true
max_solution_length: 60000
random_seed: 42