# GPU Mode: Float16 Vector Addition
# Usage: uv run skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy> -i 50

# Run-level settings.
# NOTE(review): the usage line at the top of this file passes `-i 50`;
# presumably that CLI flag overrides max_iterations below — confirm which wins.
max_iterations: 100
# Presumably the number of iterations between checkpoints — confirm.
checkpoint_interval: 10
log_level: INFO

# LLM backend settings.
llm:
  # Only one model is listed, so its weight of 1.0 is the only weight in play.
  # NOTE(review): presumably weights are relative selection weights when
  # several models are listed — confirm against the consumer.
  models:
    - name: "gpt-5"
      weight: 1.0
  api_base: https://api.openai.com/v1
  temperature: 0.7
  # top_p is left unset by default: some providers (e.g. Anthropic) reject
  # requests that specify both temperature and top_p.
  # top_p: 0.95
  max_tokens: 32000
  # NOTE(review): unit is not stated in this file; presumably seconds — confirm.
  timeout: 600

# Prompt configuration.
prompt:
  # system_message is a literal block scalar (`|`): line breaks in the text
  # below are preserved exactly, and `#` or `-` characters inside it are part
  # of the string, not YAML syntax. The text states the task (float16
  # C = A + B), the required output structure, and the `custom_kernel(data)`
  # entry point.
  # NOTE(review): the task is titled "vector addition" but the inputs are
  # described as 2-D (N, N) tensors; element-wise addition is presumably
  # what is meant — confirm.
  system_message: |
    You are an expert Triton kernel engineer. Output ONLY Python code - no explanations.

    REQUIRED OUTPUT STRUCTURE:
    1. Imports: torch, triton, triton.language as tl
    2. @triton.jit kernel function(s)
    3. def custom_kernel(data) wrapper - REQUIRED entry point

    Task: Optimize float16 vector addition kernel. C = A + B
    Input: Tuple of (A, B) tensors of shape (N, N) and dtype torch.float16
    Output: Tensor of shape (N, N) and dtype torch.float16
    N can be: 256, 512, 1024, 2048, 4096, 8192

    Optimization tips:
    - Block size tuning (512, 1024, 2048, 4096)
    - Use @triton.autotune for automatic parameter tuning
    - Vectorized loads for memory operations
    - Grid configuration for occupancy
    - Memory coalescing for sequential access patterns

    MUST use @triton.jit decorator. MUST return float16 tensor.
    Output complete, working code in a single ```python``` block.

# Evaluator settings.
evaluator:
  # NOTE(review): unit is not stated in this file; presumably seconds — confirm.
  timeout: 600
  max_retries: 3
  cascade_evaluation: true
  # Two thresholds, listed in descending order (0.4, then 0.3).
  # NOTE(review): presumably one threshold per cascade stage — confirm that
  # the descending order is intentional.
  cascade_thresholds: [0.4, 0.3]

# Generation settings.
# NOTE(review): the system_message in this file asks for "complete, working
# code in a single ```python``` block"; verify that this is compatible with
# diff-based generation being enabled here.
diff_based_generation: true
# NOTE(review): unit (characters vs. tokens) is not visible here — confirm.
max_solution_length: 60000
# Fixed seed; presumably for reproducible runs — confirm.
random_seed: 42