"""
Reference implementation for the float16 vector addition Triton kernel.

C = A + B, computed element-wise on N x N float16 tensors.
"""
import math

# torch is optional at import time: when it is missing the module still
# imports, and ``torch`` is bound to None.
try:
    import torch
except ImportError:
    torch = None  # Modal-only mode — functions below won't be called locally
# ---------------------------------------------------------------------------
# Reward parameters
# ---------------------------------------------------------------------------
# NOTE(review): none of these constants is read in the visible part of this
# file; presumably an external scoring harness consumes them — confirm.
CORRECTNESS_WEIGHT = 0.3  # presumably the weight of the correctness term
SPEED_WEIGHT = 1.0  # presumably the weight of the speed term
SPEED_MAX_REWARD = 10.0  # presumably an upper bound on the speed reward
# ---------------------------------------------------------------------------
# Test / benchmark cases
# ---------------------------------------------------------------------------
# Each entry carries the keys ``N`` and ``seed``, which match the parameter
# names of ``generate_input`` (presumably unpacked into it by the harness).
TEST_CASES = [
    dict(N=size, seed=seed)
    for size, seed in ((256, 42), (512, 123), (1024, 456), (2048, 789))
]
# Same entry layout as the test cases (keys ``N`` and ``seed``), with sizes
# that go up to 8192 and a separate range of seeds.
BENCHMARK_CASES = [
    {"N": size, "seed": seed}
    for size, seed in zip((1024, 2048, 4096, 8192), (1001, 1002, 1003, 1004))
]
# ---------------------------------------------------------------------------
# Reference kernel
# ---------------------------------------------------------------------------
def ref_kernel(data):
    """Return the sum of the two operands packed in *data*.

    Args:
        data: A two-element iterable ``(a, b)``; it is unpacked, so any other
            length raises ``ValueError``.

    Returns:
        The result of ``a + b``.
    """
    a, b = data
    return a + b
def generate_input(N, seed, *, device="cuda"):
    """Build a reproducible pair of random float16 operands.

    Args:
        N: Side length; each returned tensor has shape ``(N, N)``.
        seed: Seed for the generator that both tensors are drawn from.
        device: Device on which the generator and the tensors are created.
            Defaults to ``"cuda"``, the value that used to be hard-coded;
            pass ``"cpu"`` to build inputs on a machine without a GPU.

    Returns:
        A tuple ``(a, b)`` of ``torch.float16`` tensors of shape ``(N, N)``.
    """
    gen = torch.Generator(device=device)
    gen.manual_seed(seed)
    # Both draws share one generator, so ``b`` continues the random stream
    # where ``a`` left off rather than repeating it.
    a = torch.randn(N, N, device=device, dtype=torch.float16, generator=gen)
    b = torch.randn(N, N, device=device, dtype=torch.float16, generator=gen)
    return (a, b)
def check_implementation(data, output, rtol=1e-3, atol=1e-3):
    """Compare a candidate output against the reference result.

    Args:
        data: Input pair; forwarded unchanged to ``ref_kernel``.
        output: Candidate result to validate.
        rtol: Relative tolerance passed to ``torch.allclose``.
        atol: Absolute tolerance passed to ``torch.allclose``.

    Returns:
        A tuple ``(ok, message)``. ``ok`` is True only when *output* is a
        float16 tensor with the reference's shape and device and is within
        tolerance of it; otherwise ``message`` names the first failed check.
    """
    # A candidate that forgets to return its result hands us None; report
    # that as a failed check instead of crashing on ``output.shape``.
    if not isinstance(output, torch.Tensor):
        return (
            False,
            f"Type mismatch: expected torch.Tensor, got {type(output).__name__}",
        )

    ref_out = ref_kernel(data)
    if output.shape != ref_out.shape:
        return False, f"Shape mismatch: expected {ref_out.shape}, got {output.shape}"
    if output.dtype != torch.float16:
        return False, f"Dtype mismatch: expected float16, got {output.dtype}"
    # torch.allclose raises when its operands live on different devices, so
    # turn that situation into an ordinary failure result.
    if output.device != ref_out.device:
        return False, f"Device mismatch: expected {ref_out.device}, got {output.device}"
    if torch.allclose(output, ref_out, rtol=rtol, atol=atol):
        return True, "Match"

    # Upcast before subtracting so the reported difference is computed in
    # float32 rather than float16.
    diff = torch.abs(output.float() - ref_out.float())
    return False, f"Output mismatch: max_diff={diff.max().item():.6f}"
# ---------------------------------------------------------------------------
# Self-contained reference code for Modal execution
# ---------------------------------------------------------------------------
# NOTE: this string duplicates ref_kernel / generate_input /
# check_implementation from this module as source text, so it must itself be
# valid, correctly indented Python. Keep the two copies in sync by hand.
MODAL_REFERENCE_CODE = '''
import torch


def ref_kernel(data):
    a, b = data
    return a + b


def generate_input(N, seed):
    gen = torch.Generator(device="cuda")
    gen.manual_seed(seed)
    a = torch.randn(N, N, device="cuda", dtype=torch.float16, generator=gen)
    b = torch.randn(N, N, device="cuda", dtype=torch.float16, generator=gen)
    return (a, b)


def check_implementation(data, output, rtol=1e-3, atol=1e-3):
    ref_out = ref_kernel(data)
    if output.shape != ref_out.shape:
        return False, f"Shape mismatch: expected {ref_out.shape}, got {output.shape}"
    if output.dtype != torch.float16:
        return False, f"Dtype mismatch: expected float16, got {output.dtype}"
    if torch.allclose(output, ref_out, rtol=rtol, atol=atol):
        return True, "Match"
    diff = torch.abs(output.float() - ref_out.float())
    return False, f"Output mismatch: max_diff={diff.max().item():.6f}"
'''
|