JustinTX
/

sky2

Model card Files Files and versions

sky2 / benchmarks /gpu_mode /grayscale /reference.py

JustinTX's picture

Add files using upload-large-folder tool

af83196 verified 20 days ago

history blame contribute delete

3.7 kB

	"""
	Reference implementation for Grayscale Triton kernel.
	Y = 0.2989 R + 0.5870 G + 0.1140 B

	Input: (H, W, 3) float32 RGB tensor
	Output: (H, W) float32 grayscale tensor
	"""

	import torch

	# ---------------------------------------------------------------------------
	# Scoring and benchmark configuration (read by shared_eval.py)
	# ---------------------------------------------------------------------------

	SCORE_SCALE = 3000.0

	# grayscale uses CUDA events timing, 0.1% rel error, 120s wall clock timeout
	BENCH_USE_CUDA_EVENTS = True
	BENCH_REL_ERROR = 0.001
	BENCH_WALL_TIMEOUT_NS = 120e9
	BENCH_NO_GRAD = False
	BENCH_MAX_REPEATS = 100
	BENCH_MAX_TIME_NS = 10e9
	BENCH_WARMUP_STYLE = 'tiny_benchmark'

	# ---------------------------------------------------------------------------
	# Test / benchmark cases
	# ---------------------------------------------------------------------------

	TEST_CASES = [
	{"size": 256, "seed": 42},
	{"size": 512, "seed": 123},
	{"size": 1024, "seed": 456},
	{"size": 2048, "seed": 789},
	]

	BENCHMARK_CASES = [
	{"size": 1024, "seed": 1001},
	{"size": 2048, "seed": 1002},
	{"size": 4096, "seed": 1003},
	{"size": 8192, "seed": 1004},
	]

	# ---------------------------------------------------------------------------
	# Reference kernel
	# ---------------------------------------------------------------------------


	def ref_kernel(data):
	"""Reference: Y = 0.2989 R + 0.5870 G + 0.1140 B"""
	rgb, output = data
	weights = torch.tensor([0.2989, 0.5870, 0.1140], device=rgb.device, dtype=rgb.dtype)
	output[...] = torch.sum(rgb * weights, dim=-1)
	return output


	def generate_input(size, seed):
	gen = torch.Generator(device="cuda")
	gen.manual_seed(seed)
	x = torch.rand(size, size, 3, device="cuda", dtype=torch.float32, generator=gen).contiguous()
	y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
	return x, y


	def check_implementation(data, submission_output, rtol=1e-4, atol=1e-4):
	ref_output = ref_kernel(data)
	if submission_output.shape != ref_output.shape:
	return False, f"Shape mismatch: expected {ref_output.shape}, got {submission_output.shape}"
	if torch.allclose(submission_output, ref_output, rtol=rtol, atol=atol):
	return True, "Match"
	diff = torch.abs(submission_output.float() - ref_output.float())
	return False, f"Output mismatch: max_diff={diff.max().item():.6f}"


	# ---------------------------------------------------------------------------
	# Self-contained reference code for Modal remote execution
	# ---------------------------------------------------------------------------

	MODAL_REFERENCE_CODE = r'''
	import torch

	def ref_kernel(data):
	rgb, output = data
	weights = torch.tensor([0.2989, 0.5870, 0.1140], device=rgb.device, dtype=rgb.dtype)
	output[...] = torch.sum(rgb * weights, dim=-1)
	return output

	def generate_input(size, seed):
	gen = torch.Generator(device="cuda")
	gen.manual_seed(seed)
	x = torch.rand(size, size, 3, device="cuda", dtype=torch.float32, generator=gen).contiguous()
	y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
	return x, y

	def check_implementation(data, submission_output, rtol=1e-4, atol=1e-4):
	ref_output = ref_kernel(data)
	if submission_output.shape != ref_output.shape:
	return False, f"Shape mismatch: expected {ref_output.shape}, got {submission_output.shape}"
	if torch.allclose(submission_output, ref_output, rtol=rtol, atol=atol):
	return True, "Match"
	diff = torch.abs(submission_output.float() - ref_output.float())
	return False, f"Output mismatch: max_diff={diff.max().item():.6f}"
	'''