diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py new file mode 100644 index 0000000000000000000000000000000000000000..b51aae85b8778e0a3a46e37aeaa85baf8a8892ed --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/autotune_process.py @@ -0,0 +1,656 @@ +from __future__ import annotations + +import contextlib +import dataclasses +import functools +import logging +import os +import queue +import time +import warnings +from concurrent.futures import ThreadPoolExecutor +from ctypes import byref, c_size_t, c_void_p +from multiprocessing.process import BaseProcess +from multiprocessing.queues import Queue +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) + +import torch +from torch import multiprocessing +from torch._dynamo.testing import rand_strided + +from torch._inductor import ir +from torch._inductor.codecache import CUDACodeCache, DLLWrapper, PyCodeCache + +if TYPE_CHECKING: + from torch._inductor.select_algorithm import TritonTemplateCaller + +from . import config +from .utils import do_bench +from .virtualized import V + +CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES" +EXIT_HANDLER_REGISTERED = False + +log = logging.getLogger(__name__) + + +# Used to synchronize between parent and child processes +class Ping: + pass + + +class Pong: + pass + + +@contextlib.contextmanager +def set_cuda_visible_device(device: Optional[int]): + """ + Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the + specified single device. If device is None, don't manipulate the environment. + """ + if device is None: + yield + return + + current = os.environ.get(CUDA_VISIBLE_DEVICES) + os.environ[CUDA_VISIBLE_DEVICES] = str(device) + try: + yield + finally: + if current is None: + del os.environ[CUDA_VISIBLE_DEVICES] + else: + os.environ[CUDA_VISIBLE_DEVICES] = current + + +@dataclasses.dataclass +class TuningProcess: + """ + Abstraction for launching a helper process to benchmark kernels. Spawns + the parent process and uses multiprocessing queues to send benchmark + requests and return results. + """ + + device: Optional[int] = None + process: Optional[BaseProcess] = None + request_queue: Optional[Queue[Any]] = None + response_queue: Optional[Queue[Any]] = None + + @staticmethod + def process_main( + request_queue: Queue[Any], + response_queue: Queue[Any], + ) -> None: + """ + Entry point for the child process. + """ + log.debug( + "Entering TuningProcess child. Visible devices = %s", + os.environ.get(CUDA_VISIBLE_DEVICES), + ) + try: + TuningProcess.workloop(request_queue, response_queue) + except Exception as ex: + log.exception("Exception in TuningProcess: %s", ex) + + @staticmethod + def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None: + """ + Work loop for the benchmarking subprocess. + """ + while True: + obj = request_queue.get() + + if obj is None: + break # None is a sentinel for the child to terminate + elif isinstance(obj, Ping): + response_queue.put(Pong()) + elif isinstance(obj, BenchmarkRequest): + response_queue.put(obj.benchmark()) + else: + raise RuntimeError(f"Invalid request type {type(obj)}") + + def valid(self) -> bool: + """ + True if the sub-process has been initialized. + """ + return ( + self.process is not None + and self.request_queue is not None + and self.response_queue is not None + ) + + def clear(self) -> None: + """ + Reset to an uninitialized state. + """ + self.process = self.request_queue = self.response_queue = None + + def initialize(self) -> None: + """ + Create child process, request/response queues, and do the warm up. + Set the environment to make only the provided GPU device visible + to the process. + """ + if self.valid(): + return + + # cuda runtime does not work with "fork", use "spawn" to start processes. + ctx = multiprocessing.get_context("spawn") + self.request_queue = ctx.Queue() + self.response_queue = ctx.Queue() + + self.process = ctx.Process( + target=self.process_main, + args=( + self.request_queue, + self.response_queue, + ), + ) + assert self.process is not None + with set_cuda_visible_device(self.device): + self.process.start() + + def put(self, obj: Any) -> None: + """ + Push a work item to the child process. + """ + # In case of a prior crash, ensure the subprocess is running + self.initialize() + assert self.request_queue is not None + self.request_queue.put(obj) + + def get(self) -> Any: + """ + Get a response from the child process. + """ + assert self.process is not None + assert self.response_queue is not None + while True: + try: + return self.response_queue.get(timeout=1.0) + except queue.Empty: + status = self.process.exitcode + if status is None: + # child process is still running + continue + # child process crashed + self.clear() + raise + + def terminate(self) -> None: + """ + Signal the child process to terminate. + """ + if self.valid(): + assert self.process is not None + assert self.request_queue is not None + self.request_queue.put(None) + + def wait(self) -> None: + """ + Wait for the child process to exit. + """ + if self.process is not None: + self.process.join() + self.clear() + + +@dataclasses.dataclass +class TuningProcessPool: + """ + Maintains a pool of TuningProcesses to benchmark kernels in parallel + across devices. By default, we create one TuningProcess per device and + set the sub-process environment to make only that device visible. + """ + + processes: Optional[queue.Queue[TuningProcess]] = None + executor: Optional[ThreadPoolExecutor] = None + + def initialize(self) -> None: + """ + Start the child processes. + """ + assert (self.processes is None) == (self.executor is None) + if self.processes is not None: + return + + devices = self.get_device_list() + log.debug("Sub-process autotune device list: %s", devices) + + # Launch the child processes and push a msg to "warm up" + self.processes = queue.Queue() + for device in devices: + p = TuningProcess(device=device) + p.initialize() + p.put(Ping()) + self.processes.put(p) + + # Wait for the initialization to finish + for p in self.processes.queue: + assert isinstance(p.get(), Pong) + + # Use a thread pool to manage distributing work to the subprocesses. + # Threads block on an available process, so it makes sense to match + # the number of threads with the number of devices. + self.executor = ThreadPoolExecutor(max_workers=len(devices)) + + # Register the exit handler for the parent process so it will terminate + # the child processes. + global EXIT_HANDLER_REGISTERED + if not EXIT_HANDLER_REGISTERED: + EXIT_HANDLER_REGISTERED = True + import atexit + + atexit.register(self.terminate) + + def get_device_list(self) -> Sequence[Optional[int]]: + """ + Gather the list of devices to be used in the pool. + """ + if not config.autotune_multi_device: + # Don't use multiple devices + return [None] + + count = torch.cuda.device_count() + + # If the user specified the visible devices in the env, use those. + if CUDA_VISIBLE_DEVICES in os.environ: + devices = [int(d) for d in os.environ[CUDA_VISIBLE_DEVICES].split(",")] + assert len(devices) <= count + return devices + + return list(range(count)) + + def terminate(self) -> None: + """ + Signal all child processes to terminate. + """ + if self.executor is not None: + self.executor.shutdown() + self.executor = None + + if self.processes is not None: + for p in self.processes.queue: + p.terminate() + for p in self.processes.queue: + p.wait() + self.processes = None + + def target(self, choice: TritonTemplateCaller) -> float: + """ + Entry point for the thread-pool helper threads: Wait for an open TuningProcess, + remove it from the queue, execute the benchmark in that subprocess, and return + the TuningProcess to the queue. + """ + assert choice.bmreq is not None + assert self.processes is not None + + process = self.processes.get() + process.put(choice.bmreq) + try: + return process.get() + except queue.Empty: + warnings.warn( + f"Failed to benchmark choice '{choice}'. It will be ignored. " + "Please debug the root cause in case the choice can bring perf gains." + ) + # set to INF so this choice will be ignored + return float("inf") + finally: + self.processes.put(process) + + def benchmark( + self, + choices: List[TritonTemplateCaller], + ) -> Dict[TritonTemplateCaller, float]: + """ + Benchmark each choice in a separate process. + """ + assert self.processes is not None, "Tuning process pool is not initialized" + assert self.executor is not None + + results = {} + + # Use a ThreadExecutorPool to spread the work across the subprocesses and + # to grab subprocesses as soon as they're free. + for choice, result in zip(choices, self.executor.map(self.target, choices)): + results[choice] = result + + return results + + +tuning_pool = TuningProcessPool() + + +LayoutOrBuffer = Union[ir.Layout, ir.Buffer] + + +@dataclasses.dataclass +class TensorMeta: + device: torch.device + dtype: torch.dtype + sizes: torch._prims_common.ShapeType + strides: torch._prims_common.StrideType + offset: int + + @classmethod + def from_irnodes( + cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]] + ) -> Union[TensorMeta, List[TensorMeta]]: + if isinstance(irnodes, Sequence): + result: List[Any] = [cls.from_irnodes(x) for x in irnodes] + assert all(isinstance(x, TensorMeta) for x in result) + return result + + node = irnodes + if isinstance(node, ir.Layout): + node = ir.Buffer("fake", node) + + dtype = node.get_dtype() + assert dtype is not None + + return TensorMeta( + device=node.get_device(), + dtype=dtype, + sizes=V.graph.sizevars.size_hints( + node.get_size(), + fallback=config.unbacked_symint_fallback, + ), + strides=V.graph.sizevars.size_hints( + node.get_stride(), + fallback=config.unbacked_symint_fallback, + ), + offset=V.graph.sizevars.size_hint( + node.get_layout().offset, + fallback=config.unbacked_symint_fallback, + ), + ) + + def to_tensor(self) -> torch.Tensor: + return rand_strided( + self.sizes, + self.strides, + device=self.device, + dtype=self.dtype, + extra_size=self.offset, + ) + + +@dataclasses.dataclass +class BenchmarkRequest: + """ + Only handle triton template benchmark for now. The extern kernel benchmark + can be done inside the same process since they usually don't cause crash. + + Important: Instances of this class and subclasses have to be serializable + across process boundaries. Do not put CUDA Tensors in here! + """ + + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + ): + # the kernel name defined in the module + self.kernel_name = kernel_name + + if isinstance(input_tensor_meta, TensorMeta): + input_tensor_meta = [input_tensor_meta] + self.input_tensor_meta = input_tensor_meta + + if isinstance(output_tensor_meta, (tuple, list)): + assert len(output_tensor_meta) == 1 + output_tensor_meta = output_tensor_meta[0] + self.output_tensor_meta = output_tensor_meta + + self.extra_args = extra_args + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + raise NotImplementedError() + + def cleanup_run_fn(self) -> None: + pass + + def benchmark( + self, + *input_tensors: torch.Tensor, + output_tensor: Optional[torch.Tensor] = None, + ) -> float: + debug = log.isEnabledFor(logging.DEBUG) + if debug: + start_ts = time.time() + + # create args and out tensor + if output_tensor is None: + assert len(input_tensors) == 0 + input_tensors = tuple(x.to_tensor() for x in self.input_tensor_meta) + output_tensor = self.output_tensor_meta.to_tensor() + + if debug: + create_tensor_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + start_ts = time.time() + + fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor) + + if debug: + load_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + start_ts = time.time() + + out = do_bench(fn) + torch.cuda.synchronize() # shake out any CUDA errors + + if debug: + bench_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + log.debug( + "InChildProcess %s: load %f, create tensor %f, bench %f", + str(self), + load_elapse, # type: ignore[possibly-undefined] + create_tensor_elapse, # type: ignore[possibly-undefined] + bench_elapse, + ) + self.cleanup_run_fn() + return out + + +class TestBenchmarkRequest(BenchmarkRequest): + """ + Supports unit testing. Defined in this file so that the TuningProcess + sub-process knows how to unpickle these objects. + """ + + def __init__(self, value: Optional[float] = None) -> None: + self.value = value + + def benchmark( + self, *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None + ) -> float: + if self.value is None: + raise Exception("Failed to run") + return self.value + + +class TritonBenchmarkRequest(BenchmarkRequest): + # Important: Instances of this class have to be serializable + # across process boundaries. Do not put CUDA Tensors in here! + + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + module_path: str, # the path of the module defining the triton kernel + module_cache_key: str, + grid: List[int], + num_stages: int, + num_warps: int, + matrix_instr_nonkdim: int = 0, # only used for hip to choose the shape of mfma instruction. + ): + super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) + self.module_path = module_path + self.module_cache_key = module_cache_key + self.grid = grid + self.num_stages = num_stages + self.num_warps = num_warps + self.matrix_instr_nonkdim = matrix_instr_nonkdim + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path) + log.debug( + "benchmark module key: %s, path: %s", + self.module_cache_key, + self.module_path, + ) + + run_method = getattr(mod, self.kernel_name).run + extra_args = list(self.extra_args) + + # Newer version of triton add warmup argument to JITFunction.run. + # This code handles backward-compatibility. + warmup_arg = {} + import inspect + + if "warmup" in inspect.signature(run_method).parameters: + warmup_arg["warmup"] = False + + if torch.version.hip and self.matrix_instr_nonkdim != 0: + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + num_stages=self.num_stages, + num_warps=self.num_warps, + matrix_instr_nonkdim=self.matrix_instr_nonkdim, + ) + else: + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + num_stages=self.num_stages, + num_warps=self.num_warps, + ) + + def __str__(self) -> str: + return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}" + + +class CUDABenchmarkRequest(BenchmarkRequest): + # Important: Instances of this class have to be serializable + # across process boundaries. Do not put CUDA Tensors in here! + + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + source_code: str, + ): + super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) + self.source_code = source_code + self.workspace_size: int = 0 + self.workspace: Optional[torch.Tensor] = None + self.DLL: Optional[DLLWrapper] = None + self.hash_key: str = "" + self.source_file: str = "" + self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so") + + def precompile(self): + # Prepopulate CUDACodeCache + # may happen in separate Threadpool + log.debug("Precompiling %s", self) + CUDACodeCache.load(self.source_code, "so") + log.debug("Done precompiling %s", self) + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + self.DLL, self.hash_key, self.source_file = CUDACodeCache.load( + self.source_code, "so" + ) + args = [ + c_void_p(tensor.data_ptr()) + for tensor in list(input_tensors) + [output_tensor] + ] + log.debug( + "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", + self.kernel_name, + self.source_file, + self.hash_key, + self.DLL, + args, + self.extra_args, + ) + run_method = getattr(self.DLL, self.kernel_name) + stream_ptr = c_void_p(torch.cuda.current_stream().cuda_stream) + + # Retrieve workspace_size and initialize workspace. + c_workspace_size = c_size_t() + run_method( + *args, # input ptrs and output ptrs + *self.extra_args, + byref( + c_workspace_size + ), # set workspace size ptr to retrieve workspace size + None, # null workspace ptr + stream_ptr, + ) + self.workspace_size = c_workspace_size.value + # TODO: Support non-zero workspace_size. + assert self.workspace_size == 0, ( + "Things need to be fixed to support non-zero workspace_size: " + "1) max autotune cache needs to store workspace size; " + "2) memory allocation needs to allocate / deallocate workspace correctly; " + ) + + # Generate partial function. + return functools.partial( + run_method, + *args, + *self.extra_args, + None, # null workspace size ptr + None, # set workspace ptr, TODO: update it to a real ptr if workspace_size > 0 + stream_ptr, + ) + + def cleanup_run_fn(self) -> None: + if self.DLL is not None: + self.DLL.close() + self.workspace = None + + def __str__(self) -> str: + return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" + + +def benchmark_in_sub_process( + choices: List[TritonTemplateCaller], +) -> Dict[TritonTemplateCaller, float]: + """ + Do benchmarking in a subprocess and return the perf number (latency). + """ + return tuning_pool.benchmark(choices) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/coordinate_descent_tuner.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/coordinate_descent_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e1efd1f350480c10114fdf140757893630085a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/coordinate_descent_tuner.py @@ -0,0 +1,315 @@ +import copy +import itertools +import logging +from typing import Callable, Optional + +from torch.utils._triton import has_triton +from .utils import red_text, triton_config_to_hashable + +if has_triton(): + import triton +else: + triton = None + +from . import config as inductor_config + +log = logging.getLogger(__name__) + + +def get_field(config, name): + if name == "num_warps": + return config.num_warps + elif name == "num_stages": + return config.num_stages + else: + return config.kwargs.get(name, None) + + +def set_field(config, name, value): + if name == "num_warps": + config.num_warps = value + elif name == "num_stages": + config.num_stages = value + else: + config.kwargs[name] = value + + +class CoordescTuner: + """ + The coordinate descent tuner. Tune one field/coordinate at a time. + + TODO will it be necessary to tune multiple fields simultaneously. + + + TODO: what if both increasing and decreasing a field can improve perf. + i.e., there are multiple local optima.. + """ + + def __init__(self, is_mm=False, name="unknown", size_hints=None): + self.is_mm = is_mm # we will tune num_stages for mm + self.cached_benchmark_results = {} + self.name = name + self.size_hints = size_hints + + def get_xmax(self): + xmax = inductor_config.triton.max_block["X"] + if self.size_hints and len(self.size_hints) > 0: + xmax = min(xmax, self.size_hints[0]) + return xmax + + def get_ymax(self): + ymax = inductor_config.triton.max_block["Y"] + if self.size_hints and len(self.size_hints) > 1: + ymax = min(ymax, self.size_hints[1]) + return ymax + + def get_zmax(self): + zmax = inductor_config.triton.max_block["Z"] + if self.size_hints and len(self.size_hints) > 2: + zmax = min(zmax, self.size_hints[2]) + return zmax + + def get_rmax(self): + if self.size_hints and len(self.size_hints) > 0: + return self.size_hints[-1] # the last one is for reduction + else: + # large enough. We should not pick this large RBLOCK anyway + return 2**30 + + def get_warpsmax(self): + # Currently, CUDA has a maximum of 1024 threads, so 32 is the max + # number of warps. + return 1024 // 32 + + def cache_benchmark_result(self, config, timing): + self.cached_benchmark_results[triton_config_to_hashable(config)] = timing + + def lookup_in_cache(self, config): + return self.cached_benchmark_results.get(triton_config_to_hashable(config)) + + def call_func(self, func, config): + found = self.lookup_in_cache(config) + if found is not None: + log.debug(" CACHED") + return found + timing = func(config) + self.cache_benchmark_result(config, timing) + return timing + + @property + def tunable_fields(self): + out = [ + "XBLOCK", + "YBLOCK", + "ZBLOCK", + # NOTE: we should not tune RBLOCK for persistent reduction. + # We rely on the fact that persistent reduction's triton.Config + # does not have the RBLOCK field to guarantee that. + "RBLOCK", + # the following 3 are for mm + "BLOCK_M", + "BLOCK_N", + "BLOCK_K", + "num_warps", + ] + if self.is_mm: + out.append("num_stages") + + return out + + def value_too_large(self, name, val): + if name == "XBLOCK": + return val > self.get_xmax() + if name == "YBLOCK": + return val > self.get_ymax() + if name == "ZBLOCK": + return val > self.get_zmax() + if name == "RBLOCK": + return val > self.get_rmax() + if name == "num_warps": + return val > self.get_warpsmax() + + return False + + def get_neighbour_values(self, name, orig_val, radius=1, include_self=False): + """ + Get neighbour values in 'radius' steps. The original value is not + returned as it's own neighbour. + """ + assert radius >= 1 + + def update(cur_val, inc=True): + if name == "num_stages": + if inc: + return cur_val + 1 + else: + return cur_val - 1 + else: + if inc: + return cur_val * 2 + else: + return cur_val // 2 + + out = [] + # increment loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, True) + if self.value_too_large(name, cur_val): + break + out.append(cur_val) + + # decrement loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, False) + if cur_val <= 0: + break + out.append(cur_val) + + if include_self: + out.append(orig_val) + return out + + @staticmethod + def has_improvement(baseline, test): + threshold = 0.001 # 0.1% + return test is not None and test < baseline * (1 - threshold) + + def check_all_tuning_directions( + self, + func: Callable[["triton.Config"], float], + best_config, + best_timing, + ): + """ + Check all directions. We only do this once the regular coordinate + descent tuning find no better choices any more. + We only have a few tunable fields, so this should be fine. + """ + candidate_values_list = [] + effective_fields = [] + for field in self.tunable_fields: + old_value = get_field(best_config, field) + if old_value is None: + continue + candidate_values = self.get_neighbour_values( + field, + old_value, + radius=inductor_config.coordinate_descent_search_radius, + include_self=True, + ) + candidate_values_list.append(candidate_values) + effective_fields.append(field) + + choices = itertools.product(*candidate_values_list) + improved = False + for choice in choices: + assert len(choice) == len(effective_fields) + candidate_config = copy.deepcopy(best_config) + for new_val, field in zip(choice, effective_fields): + set_field(candidate_config, field, new_val) + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config = candidate_config + best_timing = candidate_timing + + return improved, best_config, best_timing + + def compare_config(self, func, candidate_config, best_config, best_timing): + """ + Check if candidate_config is better than best_config. + + Return a touple of (compare_result, candidate_timing). + compare_result is true iff candidate_config is better. + """ + log.debug("Try config %s", candidate_config) + try: + candidate_timing = self.call_func(func, candidate_config) + except Exception as e: + log.debug("Got exception %s", e) + return False, float("inf") + + if self.has_improvement(best_timing, candidate_timing): + log.debug( + "Tune from %s %f -> %s %f", + best_config, + best_timing, + candidate_config, + candidate_timing, + ) + + return True, candidate_timing + return False, candidate_timing + + def autotune( + self, + func: Callable[["triton.Config"], float], + baseline_config: "triton.Config", + baseline_timing: Optional[float] = None, + ) -> "triton.Config": + if baseline_timing is None: + baseline_timing = self.call_func(func, baseline_config) + + log.debug("= Do coordinate descent tuning for %s =", self.name) + log.debug( + "Baseline Config %s, baseline timing %f", baseline_config, baseline_timing + ) + improved = True + best_config = baseline_config + best_timing = baseline_timing + tunable_fields = self.tunable_fields + + while improved: + improved = False + + for name in tunable_fields: + cur_val = get_field(best_config, name) + # some kernel don't have RBLOCK/YBLOCK/ZBLOCK. So cur_val may be None + if cur_val is None: + continue + + # It's possible that candidate_values is empty. + # E.g., if XBLOCK is 1 initially and size_hint for x is also 1. + # We would not try either larger or smaller XBLOCK in this case. + candidate_values = self.get_neighbour_values(name, cur_val) + + for next_val in candidate_values: + candidate_config = copy.deepcopy(best_config) + set_field(candidate_config, name, next_val) + + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config, best_timing = candidate_config, candidate_timing + + if not improved and inductor_config.coordinate_descent_check_all_directions: + old_best_timing = best_timing + improved, best_config, best_timing = self.check_all_tuning_directions( + func, best_config, best_timing + ) + + if improved: + msg = red_text( + "Coordinate descend tuning found improvement of %.3fx by looking in all directions." + ) + log.debug( + msg, + old_best_timing / best_timing, + ) + + log.debug( + "Improve from %s %f -> %s %f, %.3fx", + baseline_config, + baseline_timing, + best_config, + best_timing, + baseline_timing / best_timing, + ) + + return best_config diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/debug.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..3929c22fdc38ee25696259393636ba752081660b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/debug.py @@ -0,0 +1,655 @@ +import collections +import contextlib +import cProfile +import dataclasses +import functools +import itertools +import logging +import os +import os.path +import pickle +import pstats +import shutil +import subprocess +from typing import Any, Dict, List, Optional +from unittest.mock import patch + +from functorch.compile import draw_graph, get_aot_graph_name, get_graph_being_compiled + +import torch +from torch import fx as fx + +from torch._dynamo.repro.after_aot import save_graph_repro, wrap_compiler_debug +from torch._dynamo.utils import get_debug_dir +from torch.fx.graph_module import GraphModule +from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata +from torch.fx.passes.tools_common import legalize_graph +from torch.utils._pytree import tree_map + +from . import config, ir # noqa: F811, this is needed +from .scheduler import ( + BaseSchedulerNode, + FusedSchedulerNode, + NopKernelSchedulerNode, + OutputNode, + SchedulerNode, +) +from .virtualized import V + +log = logging.getLogger(__name__) + +SchedulerNodeList = List[Any] +BufMeta = collections.namedtuple("BufMeta", ["name", "n_origin"]) +GRAPHVIZ_COMMAND_SCALABLE = ["dot", "-Gnslimit=2", "-Gnslimit1=2", "-Gmaxiter=5000"] + + +@functools.lru_cache(None) +def has_dot() -> bool: + try: + subprocess.check_output(["which", "dot"], stderr=subprocess.PIPE) + return True + except subprocess.SubprocessError: + return False + + +def draw_buffers(nodes: List[BaseSchedulerNode], print_graph=False, fname=None): + """ + Draw a graph in fname.svg. + """ + if not has_dot(): + log.warning("draw_buffers() requires `graphviz` package") + return + + if fname is None: + fname = get_graph_being_compiled() + + graph = create_fx_from_snodes(nodes) + + for node in graph.nodes: + if "fusion_meta" not in node.meta: + continue + group = node.meta["fusion_meta"].group + if isinstance(group, tuple): + if isinstance(group[1], int): + group = (group[1],) + else: + group = group[1] + + # gather meta data + dtype = None + if isinstance(node, ir.ComputedBuffer): + dtype = node.data.dtype + + metadata = TensorMetadata(group, dtype, None, None, None, None, None) # type: ignore[arg-type] + node.meta["tensor_meta"] = metadata + + if print_graph: + print(graph) + + gm = GraphModule({}, graph) + legalize_graph(gm) + gm.graph.lint() + draw_graph( + gm, fname, clear_meta=False, dot_graph_shape=config.trace.dot_graph_shape + ) + + +def create_fx_from_snodes(snodes: List[BaseSchedulerNode]) -> fx.Graph: + """ + Creates a FX Graph from a list of SchedulerNode objects. + """ + + def get_fake_func(name): + def func1(*args): + return 0 + + func1.__name__ = name + return func1 + + FusionMeta = collections.namedtuple("FusionMeta", ["group", "snode", "type"]) + + buf_to_fx_node = {} + graph = torch.fx.Graph() + first_node = None + + outputs = [] + group: Any = None + # create call_function node for each Buffer and Kernel + for snode in snodes: + if snode.is_extern(): + node_type = "extern" + group = node_type + elif snode.is_template(): + node_type = "template" + group = node_type + elif isinstance(snode, NopKernelSchedulerNode): + node_type = "nop" + group = node_type + elif isinstance(snode, SchedulerNode): + node_type = "compute" + group = snode.group + elif isinstance(snode, FusedSchedulerNode): + node_type = "fused" + group = snode.group + else: + raise RuntimeError("Unknown node type") + + fused_name = torch._inductor.utils.get_fused_kernel_name( + snode.get_nodes(), "original_aten" + ) + func_name = f"{node_type}: {fused_name}" + node_func = get_fake_func(func_name) + kwargs = {} + if hasattr(snode, "get_device"): + kwargs = {"device": snode.get_device()} + fx_node = graph.call_function(node_func, args=(), kwargs=kwargs) + + def in_output(snode): + if isinstance(snode, FusedSchedulerNode): + return any(in_output(x) for x in snode.snodes) + return any(isinstance(user.node, OutputNode) for user in snode.users) + + if in_output(snode): + outputs.append(fx_node) + name = snode.get_name() + fx_node.name = name + + fx_node.meta["fusion_meta"] = FusionMeta(group, snode, node_type) + + if isinstance(snode, FusedSchedulerNode): + for x in snode.snodes: + buf_to_fx_node[x.get_name()] = fx_node + buf_to_fx_node[name] = fx_node + + if first_node is None: + first_node = fx_node + + # create edges between nodes + for snode in snodes: + name = snode.get_name() + deps = snode.read_writes.reads + + fx_node = buf_to_fx_node[name] + new_args = [] + for dep in deps: + if dep.name in buf_to_fx_node: + dep_node = buf_to_fx_node[dep.name] + else: + with graph.inserting_before(first_node): + dep_node = graph.placeholder(dep.name) + buf_to_fx_node[dep.name] = dep_node + new_args.append(dep_node) + + fx_node.args = tuple(new_args) + + graph.output(outputs[0] if len(outputs) == 1 else tuple(outputs)) + return graph + + +def update_orig_fx_node_name_to_buf_name( + nodes: SchedulerNodeList, + node_name_to_buf_name: Dict[str, str], + parent_buf_name: Optional[str] = None, + n_origins: int = 0, +): + if nodes is None: + return + for node in nodes: + # for FusedSchedulerNode, traverse recursively into get_nodes() + buf_name = node.get_name() + children_nodes = node.get_nodes() + if children_nodes is not None and len(children_nodes) > 1: + update_orig_fx_node_name_to_buf_name( + children_nodes, + node_name_to_buf_name, + buf_name if parent_buf_name is None else parent_buf_name, + ) + continue + else: + assert len(children_nodes) == 1 and children_nodes[0] == node + + ir_node = node.node + if ir_node is None or ir_node.origins is None: + continue + for origin in ir_node.origins: + node_name = origin.name + # when buf1 and buf2 both have origin=node1 + # we draw node1 according to buf1 + if node_name not in node_name_to_buf_name: + node_name_to_buf_name[node_name] = ( + buf_name if parent_buf_name is None else parent_buf_name + ) + + +def get_node_name_to_buf_meta(node_name_to_buf_name: Dict[str, str]): + buf_name_to_n_node = {} + for node_name, buf_name in node_name_to_buf_name.items(): + if buf_name not in buf_name_to_n_node: + buf_name_to_n_node[buf_name] = {node_name} + else: + buf_name_to_n_node[buf_name].add(node_name) + + node_name_to_buf_meta = {} + for node_name, buf_name in node_name_to_buf_name.items(): + n_node = len(buf_name_to_n_node[buf_name]) + node_name_to_buf_meta[node_name] = BufMeta(buf_name, n_node) + return node_name_to_buf_meta + + +def annotate_orig_fx_with_snodes( + gm: torch.fx.GraphModule, snodes: SchedulerNodeList +) -> None: + """ + Creates a FX Graph from a list of SchedulerNode objects. + """ + node_name_to_buf_name: Dict[str, str] = {} + update_orig_fx_node_name_to_buf_name(snodes, node_name_to_buf_name) + if node_name_to_buf_name is None: + return + node_name_to_buf_meta = get_node_name_to_buf_meta(node_name_to_buf_name) + for node in gm.graph.nodes: + if node.name in node_name_to_buf_meta: + node.meta["buf_meta"] = node_name_to_buf_meta.get(node.name) + + +@contextlib.contextmanager +def enable_aot_logging(): + compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" + + import torch._functorch.aot_autograd + + log = logging.getLogger(torch._functorch.aot_autograd.__name__) + + stack = contextlib.ExitStack() + if not compile_debug: + try: + yield + finally: + stack.close() + return + + # Enable all graphs to be logged to a file by setting the flags to True + # and the log level of the file logger to DEBUG + stack.enter_context(patch("functorch.compile.config.debug_partitioner", True)) + + path = os.path.join(get_debug_dir(), "torchinductor") + os.makedirs(path, exist_ok=True) + + fh = logging.FileHandler( + os.path.join( + path, + f"aot_{get_aot_graph_name()}_debug.log", + ) + ) + fh.setLevel(logging.DEBUG) + fh.setFormatter( + logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s") + ) + log.addHandler(fh) + try: + yield + finally: + log.removeHandler(fh) + stack.close() + + +class DebugContext: + _counter = itertools.count() + + @staticmethod + def wrap(fn): + @functools.wraps(fn) + def inner(*args, **kwargs): + with DebugContext(): + return fn(*args, **kwargs) + + return wrap_compiler_debug(inner, compiler_name="inductor") + + @staticmethod + def create_debug_dir(folder_name: str) -> Optional[str]: + debug_dir = config.trace.debug_dir or get_debug_dir() + for n in DebugContext._counter: + dirname = os.path.join( + debug_dir, + "torchinductor", + f"{folder_name}.{n}", + ) + if not os.path.exists(dirname): + os.makedirs(dirname) + return dirname + return None + + def __init__(self): + self._prof = None + self._path = None + self._stack = contextlib.ExitStack() + + def copy(self, new_path: str): + if not self._path: + return + assert new_path.endswith(".debug"), new_path + if os.path.exists(new_path): + shutil.rmtree(new_path) + try: + shutil.copytree(self._path, new_path) + self._path = new_path + except OSError: + log.warning( + "Failed to copy debug files from %s to %s", self._path, new_path + ) + pass + + def fopen(self, filename: str, write_mode: str = "w", *args, **kwargs): + assert self._path + return open(os.path.join(self._path, filename), write_mode, *args, **kwargs) + + @contextlib.contextmanager + def fopen_context(self, filename: str, write_mode: str = "w", *args, **kwargs): + assert self._path + with open(os.path.join(self._path, filename), write_mode, *args, **kwargs) as f: + yield f + + def filename(self, suffix: str): + assert self._path + return os.path.join(self._path, suffix) + + def upload_tar(self): + if config.trace.upload_tar is not None: + import tarfile + + assert self._path + tar_file = os.path.join( + self._path, f"{os.path.basename(self._path)}.tar.gz" + ) + with tarfile.open(tar_file, "w:gz") as tar: + tar.add(self._path, arcname=os.path.basename(self._path)) + config.trace.upload_tar(tar_file) + + def __enter__(self): + if config.debug: + log = logging.getLogger("torch._dynamo") + prev_level = log.level + log.setLevel(logging.DEBUG) + + def reset_log_level(level): + log.setLevel(level) + + self._stack.callback(reset_log_level, prev_level) + + self._stack.enter_context(V.set_debug_handler(self)) + + if not config.trace.enabled: + return + + self._path = self.create_debug_dir(get_aot_graph_name()) + + if config.trace.debug_log: + self._setup_log_capture("debug.log", logging.DEBUG) + if config.trace.info_log: + self._setup_log_capture("info.log", logging.INFO) + if config.trace.compile_profile: + self._prof = cProfile.Profile() + self._prof.enable() + + def _setup_log_capture(self, filename: str, level: int): + log = logging.getLogger("torch._inductor") + fd = self._stack.enter_context(self.fopen(filename)) + ch = logging.StreamHandler(fd) + ch.setLevel(level) + ch.setFormatter( + logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s") + ) + log.addHandler(ch) + log.setLevel(min(log.level, level)) + self._stack.callback(log.removeHandler, ch) + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._prof: + self._prof.disable() + self._save_profile_data() + + if self._path: + self.upload_tar() + log.warning("%s debug trace: %s", get_graph_being_compiled(), self._path) + self._stack.close() + + def _save_profile_data(self): + assert self._prof + self._prof.dump_stats(self.filename("compile.prof")) + with self.fopen("compile.stats") as fd: + stats = pstats.Stats(self._prof, stream=fd) + stats.strip_dirs() + stats.sort_stats("cumtime") + stats.print_stats(100) + stats.sort_stats("tottime") + stats.print_stats(100) + + def __getattr__(self, name): + if config.trace.enabled and getattr(config.trace, name): + try: + return getattr(DebugFormatter(self), name) + except Exception: + log.warning("Ignoring exception in debug code", exc_info=True) + else: + + def ignored(*args, **kwargs): + pass + + return ignored + + +class DebugFormatter: + def __init__(self, handler): + self.fopen = handler.fopen + self.fopen_context = handler.fopen_context + self.filename = handler.filename + self.handler = handler + + def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]): + with self.fopen("fx_graph_runnable.py") as fd: + save_graph_repro(fd, gm, inputs, "inductor") + + with self.fopen("fx_graph_readable.py") as fd: + fd.write(gm.print_readable(print_output=False)) + + def fx_graph_transformed( + self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor] + ): + with self.fopen("fx_graph_transformed.py") as fd: + fd.write(gm.print_readable(print_output=False)) + + def ir_pre_fusion(self, nodes: SchedulerNodeList): + self._write_ir("ir_pre_fusion.txt", nodes) + + def ir_post_fusion(self, nodes: SchedulerNodeList): + self._write_ir("ir_post_fusion.txt", nodes) + + def _write_ir(self, filename: str, nodes: SchedulerNodeList): + with self.fopen(filename) as fd: + log.info("Writing debug ir to %s", fd.name) + for node in nodes: + fd.write(node.debug_str()) + fd.write("\n\n\n") + + def graph_diagram(self, nodes: SchedulerNodeList): + draw_buffers(nodes, fname=self.filename("graph_diagram.svg")) + + def draw_orig_fx_graph(self, gm: torch.fx.GraphModule, nodes: SchedulerNodeList): + annotate_orig_fx_with_snodes(gm, nodes) + draw_graph( + gm, + fname=self.filename("orig_fx_graph_diagram.svg"), + clear_meta=False, + prog=GRAPHVIZ_COMMAND_SCALABLE, + parse_stack_trace=True, + dot_graph_shape=config.trace.dot_graph_shape, + ) + + def output_code(self, filename): + shutil.copy(filename, self.filename("output_code.py")) + + def log_autotuning_results( + self, + name: str, + input_nodes: List[ir.IRNode], + timings: Dict["ChoiceCaller", float], # type: ignore[name-defined] # noqa: F821 + elapse: float, + ): + import json + + from .ir import FixedLayout + + def build_node_info(node: ir.IRNode): + if hasattr(node, "name"): + node_name = node.name + else: + node_name = "" + node_info = { + "name": node_name, + "type": type(node).__name__, + } + try: + layout = node.get_layout() + if isinstance(layout, FixedLayout): + offset = 0 + try: + offset = int(layout.offset) + except Exception: + try: + offset = V.graph.sizevars.size_hint( + layout.offset, fallback=0 + ) + except Exception: + pass + static_layout = FixedLayout( + layout.device, + dtype=layout.dtype, + size=list(V.graph.sizevars.size_hints(layout.size)), + stride=list(V.graph.sizevars.size_hints(layout.stride)), + offset=offset, + ) + node_info["layout"] = str(static_layout) + else: + node_info["layout"] = str(node.get_layout()) + except Exception as e: + pass + try: + node_info["dtype"] = str(node.get_dtype()) + except Exception as e: + pass + try: + node_info["device"] = str(node.get_device()) + except Exception as e: + pass + try: + node_info["stride"] = str( + V.graph.sizevars.size_hints(node.get_stride()) + ) + except Exception as e: + pass + try: + node_info["size"] = str(V.graph.sizevars.size_hints(node.get_size())) + except Exception as e: + pass + try: + node_info["numel"] = str(V.graph.sizevars.size_hint(node.get_numel())) + except Exception as e: + pass + if hasattr(node, "data") and isinstance(node.data, ir.IRNode): + node_info["data"] = build_node_info(node.data) + return node_info + + general_properties = { + "op_name": name, + "cuda_device_name": torch.cuda.get_device_name(), + "cuda_device_count": torch.cuda.device_count(), + "input_nodes": [build_node_info(node) for node in input_nodes], + "autotuning_time": elapse, + } + with self.fopen_context( + "autotuning_result_json_list.txt", "at", encoding="utf-8" + ) as fd: + for caller, time in timings.items(): + info_dict = dict(caller.info_dict()) + info_dict.update(general_properties) + info_dict["benchmark_result"] = time + json.dump(info_dict, fd) + fd.write("\n") + + +@dataclasses.dataclass +class TensorMetadataHolder: + tensor_metadata: TensorMetadata + device: torch.device + + +save_args_cnt = itertools.count() + + +def save_args_for_compile_fx_inner(*args, **kwargs): + """ + This function is used to save arguments for a compile_fx_inner function call + to the file system. Later on one can replay the compile_fx_inner call + with the saved arguments using load_args_and_run_compile_fx_inner. + """ + + folder = "/tmp/inductor_saved_args" + if not os.path.exists(folder): + os.mkdir(folder) + + def handle_tensor(x): + """ + Pickle FakeTensor will result in error: + AttributeError: Can't pickle local object 'WeakValueDictionary.__init__..remove' + + Convert all Tensor to metadata. This may also makes pickle faster. + """ + if isinstance(x, torch.Tensor): + return TensorMetadataHolder(_extract_tensor_metadata(x), x.device) + else: + return x + + args_to_save, kwargs_to_save = tree_map(handle_tensor, (args, kwargs)) + + fn_name = "compile_fx_inner" + path = f"{folder}/{fn_name}_{next(save_args_cnt)}.pkl" + with open(path, "wb") as f: + pickle.dump((args_to_save, kwargs_to_save), f) + + if log.isEnabledFor(logging.DEBUG): + message = f""" +Arguments for a compile_fx_inner call is saved to {path}. To replay the call, +run the following: + +from torch._inductor.debug import load_args_and_run_compile_fx_inner +load_args_and_run_compile_fx_inner({path!r}) + """ + # call print rather than log.debug. log.debug will print message + # prefix for each line which makes the code snippet harder to be + # copied. + # Not a big deal since the code is already been guarded by checking + # the log level. + print(message) + + +def load_args_and_run_compile_fx_inner(path: str): + from torch._inductor.compile_fx import compile_fx_inner + + with open(path, "rb") as f: + args, kwargs = pickle.load(f) + + def handle_tensor(x): + if isinstance(x, TensorMetadataHolder): + return torch._dynamo.testing.rand_strided( + x.tensor_metadata.shape, + x.tensor_metadata.stride, + x.tensor_metadata.dtype, + x.device, + ) + else: + return x + + fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + with fake_mode, config.patch("save_args", False): + args, kwargs = tree_map(handle_tensor, (args, kwargs)) + return compile_fx_inner(*args, **kwargs) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py new file mode 100644 index 0000000000000000000000000000000000000000..915c44d360fb318c1b51e6b2cb7cb497b98517e6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/decomposition.py @@ -0,0 +1,678 @@ +import functools +import logging +import math +import sys +import typing +from typing import Optional + +import torch +import torch._decomp as decomp +import torch._prims_common as utils +import torch.ao.quantization.fx._decomposed +from torch._decomp import ( + core_aten_decompositions, + get_decompositions, + remove_decompositions, +) +from torch._decomp.decompositions import ( + _grid_sampler_2d as decomp_grid_sampler_2d, + pw_cast_for_opmath, +) +from torch._decomp.decompositions_for_rng import extra_random_decomps +from torch._higher_order_ops.out_dtype import out_dtype +from torch._prims_common import ( + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + type_to_dtype, +) + +from . import config, inductor_prims + +log = logging.getLogger(__name__) +aten = torch.ops.aten +prims = torch.ops.prims +quantized_decomposed = torch.ops.quantized_decomposed + +inductor_decompositions = get_decompositions( + [ + aten._adaptive_avg_pool2d_backward, + aten.arange, + aten.bitwise_and_, + aten.bitwise_or_, + aten.clamp_min_, + aten.dist, + aten.empty_like, + aten.flip, + aten.gelu, + aten.hardtanh, + aten.index_select, + aten.lcm, + aten.leaky_relu, + aten.linalg_vector_norm, + aten._log_softmax, + aten.max_pool2d_with_indices_backward, + aten._native_batch_norm_legit, + aten._native_batch_norm_legit_functional, + aten._native_batch_norm_legit_no_training, + aten.native_batch_norm, + aten.native_group_norm, + aten.native_layer_norm, + aten.nll_loss2d_backward, + aten._softmax, + aten.sin_, + aten.sqrt_, + out_dtype, + aten._to_copy, + aten.tril_indices, + aten.triu_indices, + aten.upsample_bilinear2d.vec, + ] +) +decompositions = {**core_aten_decompositions(), **inductor_decompositions} + +# Remove unwanted decompositions included via the core ATen decompositions from +# the Inductor decomp table. +decomps_to_exclude = [ + aten._unsafe_index, + aten._scaled_dot_product_flash_attention_for_cpu.default, # See comments in torch/_decomp/decompositions.py + aten.clamp_max, + aten.clamp_min, + aten.glu, # inductor lowers this directly + aten.split.Tensor, # inductor lowers this directly + aten.squeeze, # inductor lowers this directly + aten.sum, # inductor lowers this directly + aten.unbind, # inductor lowers this directly +] + +remove_decompositions(decompositions, decomps_to_exclude) + + +def register_decomposition(ops): + for op in [ops] if callable(ops) else ops: + if op in decompositions: + log.warning("duplicate decomp: %s", ops) + return decomp.register_decomposition(ops, decompositions) + + +# TODO: for now, inductor doesn't handle asserts +# because the condition is symbool -> tensor in the graph. +@register_decomposition([aten._assert_async.msg]) +def assert_async_msg_decomp(tensor, msg): + return + + +# Following `assert_async_msg_decomp` and implement as non-op. +@register_decomposition([aten._functional_assert_async.msg]) +def functional_assert_async_msg_decomp(tensor, msg): + return + + +@register_decomposition([aten.sym_constrain_range_for_size.default]) +def sym_constrain_range_for_size(symbol, *, min=None, max=None): + return + + +@register_decomposition([aten.clamp]) +@pw_cast_for_opmath +def clamp(x, min=None, max=None): + if min is not None: + x = x.clamp_min(min) + if max is not None: + x = x.clamp_max(max) + return x + + +@register_decomposition([aten.full]) +def full(size, fill_value, **kwargs): + dtype = kwargs.get("dtype") + if dtype is None: + kwargs["dtype"] = type_to_dtype(type(fill_value)) + return aten.full(size, fill_value, **kwargs) + return NotImplemented + + +# Not really sure how to put this into the main library. PrimTorch wants +# empty_permuted to go to the prim, and typically users don't really want +# to decompose to empty_strided (but inductor is OK with it, because we are +# cool with strides and everything goes to empty_strided) +@register_decomposition([aten.empty_permuted.default]) +def empty_permuted(size, physical_layout, **kwargs): + perm = [0] * len(size) + for p, l in enumerate(physical_layout): + perm[l] = p + return torch.empty([size[l] for l in physical_layout], **kwargs).permute(perm) + + +@register_decomposition([aten.convolution_backward]) +def convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_mask, +): + if not output_mask[2] or grad_output.device.type != "cuda": + return NotImplemented + grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim()))) + grad_inp, grad_weight, _ = aten.convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + [output_mask[0], output_mask[1], False], + ) + return (grad_inp, grad_weight, grad_bias) + + +@register_decomposition([aten.log2]) +def log2(x): + return torch.log(x) * (1.0 / math.log(2.0)) + + +@register_decomposition([aten.round.decimals]) +def round_dec(x, decimals=0): + ten_pow_decimals = 10.0**decimals + return aten.round(x * ten_pow_decimals) * (1.0 / ten_pow_decimals) + + +@register_decomposition([aten.bmm]) +@pw_cast_for_opmath +def bmm(self, batch2): + if config.coordinate_descent_tuning: + if self.shape[1] == 1 or batch2.shape[2] == 1: + out = (self.unsqueeze(-1) * batch2.unsqueeze(1)).sum(dim=2) + return out + if self.device.type == "cpu": + if self.size(1) == 1 and batch2.size(-1) == 1: + return torch.sum( + self.squeeze(1) * batch2.squeeze(-1), dim=1, keepdim=True + ).unsqueeze(1) + return NotImplemented + + +@register_decomposition([aten.addmm]) +@pw_cast_for_opmath +def addmm(self, mat1, mat2, beta=1, alpha=1): + if self.device.type == "cpu": + if mat1.size(0) == 1 and mat2.size(-1) == 1: + out = torch.sum( + mat1.squeeze(0) * mat2.squeeze(-1), dim=0, keepdim=True + ).unsqueeze(0) + return alpha * out + beta * self + if mat1.size(0) == 1 and mat2.size(0) <= 16 and mat2.size(1) <= 16: + out = (mat1.T * mat2).sum(dim=0, keepdim=True) + return alpha * out + beta * self + return NotImplemented + + +@register_decomposition([aten.mm]) +@pw_cast_for_opmath +def mm(self, input2): + from torch.fx.experimental.symbolic_shapes import ( + definitely_true, + guard_size_oblivious, + ) + + # Our matrix vector multiplies only achieve peak bandwidth with coordinate descent tuning. + # todo: Look into why and fix it (hopefully) + if config.coordinate_descent_tuning: + if self.shape[0] == 1 or input2.shape[1] == 1: + return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1) + if self.device.type == "cpu": + if ( + guard_size_oblivious(self.size(-1) == 1) + and guard_size_oblivious(self.size(0) > 0) + and guard_size_oblivious(input2.size(0) == 1) + and (self.dtype == input2.dtype) + and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32) + ): + return torch.cat([self[i, :] * input2 for i in range(self.size(0))]) + if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious( + input2.size(-1) == 1 + ): + return torch.sum( + self.squeeze(0) * input2.squeeze(-1), dim=0, keepdim=True + ).unsqueeze(0) + return NotImplemented + + +# This pass does two things: +# - Eliminate cat when there is only one tensor input +# - Normalize cat calls, so that legacy empty 1-D tensors are removed (NB: we +# don't remove ALL empty tensors, only the naughty ones) +@register_decomposition([aten.cat.default]) +def cat(tensors, dim=0): + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + def non_empty_tensor(x): + # For better or worse, this is a valid cat: + # + # torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)]) + # + # We'd like to eliminate naughtiness like this for downstream passes + # like split_cat. The easiest way is to just drop such inputs + # (guarding that they are non-zero). + # + # Is it permissible for this filtering to be size-oblivious? A case + # where this could matter is cat([(2, 2), (u0,)], dim=0); if u0 + # happened to be zero, we would have liked to have filtered it out. + # But actually, the ONLY way this could have passed is if u0 == 0, + # so by the time we get here we have already installed a deferred + # runtime assert forcing u0 to be zero. So if this hasn't happened, + # we know that the unbacked SymInt has appropriate size and there are + # no problems. + return len(x.shape) != 1 or guard_size_oblivious(x.shape[0] > 0) + + filtered_tensors = list(filter(non_empty_tensor, tensors)) + + if len(filtered_tensors) == 1: + return filtered_tensors[0].clone() + elif 1 < len(filtered_tensors) < len(tensors): + # on the first call, when we remove empty tensors, we redispatch recursively + return aten.cat.default(filtered_tensors, dim) + # when no 'filtering' has occurred, we raise to prevent infinite recursion (no more decomposition needed) + return NotImplemented + + +@register_decomposition([aten.angle]) +def angle(x): + if x.is_complex(): + return torch.where( + torch.isnan(x.real), float("nan"), torch.atan2(x.imag, x.real) + ) + + # when x is real number + # if x >= 0, return 0 + # if x < 0, return pi + # if x is nan, return nan + _, dtype = elementwise_dtypes( + x, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + pi = torch.scalar_tensor(math.pi, dtype=dtype, device=x.device) + ret = torch.where(x < 0, pi, 0.0) + return torch.where(torch.isnan(x), float("nan"), ret) + + +@register_decomposition([aten.add]) +def add(x, y, *, alpha=None): + x_is_complex_tensor = torch.is_tensor(x) and x.is_complex() + y_is_complex_tensor = torch.is_tensor(y) and y.is_complex() + if not x_is_complex_tensor or not y_is_complex_tensor: + return NotImplemented + z = y + if alpha is not None: + z = alpha * y + complex_type = torch.promote_types(x.dtype, y.dtype) + return (x.view(x.real.dtype) + z.view(y.real.dtype)).view(complex_type) + + +@register_decomposition([aten.conj_physical]) +def conj_physical(self): + assert not self.is_complex(), "TODO: implement this" + return self + + +@register_decomposition([aten.lift, aten.detach_]) +def lift(self): + return self + + +@register_decomposition([aten.bernoulli.default]) +def bernoulli(self, *, generator=None): + assert generator is None + return (torch.rand_like(self, dtype=torch.float32) < self).to(self.dtype) + + +@register_decomposition([aten.fmin, prims.fmin]) +def fmin(self, other): + return torch.where(torch.isnan(other) | (other > self), self, other) + + +@register_decomposition([aten.fmax, prims.fmax]) +def fmax(self, other): + return torch.where(torch.isnan(other) | (other < self), self, other) + + +@register_decomposition(aten.amax) +def amax(self, dim=None, keepdim=False): + if self.dtype == torch.bool: + return torch.any(self, dim=dim, keepdim=keepdim) + return NotImplemented + + +@register_decomposition(aten.amin) +def amin(self, dim=None, keepdim=False): + if self.dtype == torch.bool: + return torch.all(self, dim=dim, keepdim=keepdim) + return NotImplemented + + +@register_decomposition([aten.narrow_copy]) +def narrow_copy(self, dim, start, length): + return torch.narrow(self, dim, start, length).clone() + + +@register_decomposition([aten.expand_copy]) +def expand_copy(self, size, *, implicit=False): + return aten.expand(self, size, implicit=implicit).clone() + + +@register_decomposition([aten.view_copy.default]) +def view_copy_default(self, size): + return aten.view(self, size).clone() + + +@register_decomposition([aten.view_copy.dtype]) +def view_copy_dtype(self, dtype): + return self.to(dtype).clone() + + +def get_like_layout( + tensor: torch.Tensor, memory_format: Optional[torch.memory_format] +) -> torch.memory_format: + # TODO: _to_copy tensor to stride permutation + if memory_format is torch.preserve_format or memory_format is None: + return utils.suggest_memory_format(tensor) + else: + return memory_format + + +@register_decomposition(aten.rand_like) +def rand_like(self, *, dtype=None, device=None, memory_format=None, **kwargs): + return torch.rand( + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randn_like) +def randn_like(self, *, dtype=None, device=None, memory_format=None, **kwargs): + return torch.randn( + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.full_like) +def full_like( + self, + fill_value, + *, + dtype=None, + layout=None, + device=None, + pin_memory=False, + requires_grad=False, + memory_format=torch.preserve_format, +): + return torch.full( + [*self.size()], + fill_value, + dtype=dtype or self.dtype, + layout=layout or self.layout, + device=device or self.device, + requires_grad=requires_grad, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint_like.default) +def randint_like(self, high, *, dtype=None, device=None, memory_format=None, **kwargs): + return aten.randint.low( + 0, + high, + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint_like.low_dtype) +def randint_like_low( + self, low, high, *, dtype=None, device=None, memory_format=None, **kwargs +): + return aten.randint.low( + low, + high, + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint.default) +def randint(high, size, **kwargs): + return aten.randint.low(0, high, size, **kwargs) + + +# The difference between quantize_per_tensor.default and quantize_per_tensor.tensor is +# scale and zero_point is scalar or scalar tensor +@register_decomposition(quantized_decomposed.quantize_per_tensor.default) +def quantize_per_tensor_default_decomp_impl( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + if input.dtype == torch.bfloat16: + input = input.to(torch.float32) + inv_scale = 1.0 / scale + return torch.clamp( + torch.round(input * inv_scale) + zero_point, quant_min, quant_max + ).to(dtype) + + +# The difference between dequantize_per_tensor.default and dequantize_per_tensor.tensor is +# scale and zero_point is scalar or scalar tensor +@register_decomposition(quantized_decomposed.dequantize_per_tensor.default) +def dequantize_per_tensor_default_decomp_impl( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return (input.to(torch.float32) - zero_point) * scale + + +@register_decomposition(quantized_decomposed.quantize_per_tensor.tensor) +def quantize_per_tensor_tensor_decomp_impl( + input: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + if input.dtype == torch.bfloat16: + input = input.to(torch.float32) + inv_scale = 1.0 / scale + return torch.clamp( + torch.round(input * inv_scale) + zero_point, quant_min, quant_max + ).to(dtype) + + +@register_decomposition(quantized_decomposed.dequantize_per_tensor.tensor) +def dequantize_per_tensor_tensor_decomp_impl( + input: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return (input.to(torch.float32) - zero_point.to(torch.int32)) * scale.to( + torch.float32 + ) + + +@register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack) +def q_embedding_bag_byte_unpack_decomp(packed): + def bitcast_u8_to_f32(u8): + x, y, z, w = (u8[..., n].to(torch.int32) for n in (0, 1, 2, 3)) + if sys.byteorder == "little": + return (x + (y << 8) + (z << 16) + (w << 24)).view(torch.float32)[..., None] + else: + return ((x << 24) + (y << 16) + (z << 8) + w).view(torch.float32)[..., None] + + scales = bitcast_u8_to_f32(packed[..., -8:-4]) + offsets = bitcast_u8_to_f32(packed[..., -4:]) + return packed[..., :-8].to(torch.float32) * scales + offsets + + +@register_decomposition([aten.grid_sampler_2d]) +@pw_cast_for_opmath +def grid_sampler_2d( + a: torch.Tensor, + grid: torch.Tensor, + interpolation_mode: int = 0, + padding_mode: int = 0, + align_corners: bool = False, +) -> torch.Tensor: + # We do not expand the grid (_expand_grid=False) on cpu for performance reasons + # Experimenting locally it was found that compiled CUDA code is accelerated by ~5x + # and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2) + # However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first. + # Thus we apply this hack to not expand the grid for this case. + _expand_grid = not ( + a.device == torch.device("cpu") + and interpolation_mode == 0 + and a.is_contiguous(memory_format=torch.contiguous_format) + ) + + output = decomp_grid_sampler_2d( + a, + grid=grid, + interpolation_mode=interpolation_mode, + padding_mode=padding_mode, + align_corners=align_corners, + _expand_grid=_expand_grid, + ) + return output + + +@register_decomposition(aten._foreach_addcmul.Scalar) +def _foreach_addcmul_scalar(self, left_tensors, right_tensors, scalar=1): + return aten._foreach_add.List( + self, aten._foreach_mul.List(left_tensors, right_tensors), alpha=scalar + ) + + +@register_decomposition(aten._foreach_addcdiv.Scalar) +def _foreach_addcdiv_scalar(self, left_tensors, right_tensors, scalar=1): + return aten._foreach_add.List( + self, aten._foreach_div.List(left_tensors, right_tensors), alpha=scalar + ) + + +@register_decomposition(aten._foreach_lerp.Scalar) +def _foreach_lerp_scalar(start_tensors, end_tensors, weight): + return aten._foreach_add.List( + start_tensors, + aten._foreach_mul.Scalar( + aten._foreach_sub.List(end_tensors, start_tensors), weight + ), + ) + + +@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd) +@register_decomposition(aten.miopen_batch_norm) +def miopen_batch_norm( + input: torch.Tensor, + weight: torch.Tensor, + bias: typing.Optional[torch.Tensor], + running_mean: typing.Optional[torch.Tensor], + running_var: typing.Optional[torch.Tensor], + training: bool, + exponential_average_factor: float, + epsilon: float, +): + a, b, c = aten.native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + training, + exponential_average_factor, + epsilon, + ) + + if training: + return (a, b, c) + return ( + a, + weight.new_zeros((0,)), + weight.new_zeros((0,)), + ) + + +@functools.lru_cache(None) +def fast_random_decomps(): + return {**decompositions, **extra_random_decomps} + + +def select_decomp_table(): + """decomps can change based on config""" + if config.fallback_random: + return decompositions + return fast_random_decomps() + + +@register_decomposition(aten.masked_scatter) +def masked_scatter(self, mask, source): + if self.device.type == "cuda": + # This two-step algorithm is the same as eager CUDA, for eager CPU we + # use a 1-shot serial iteration. + self, mask = aten.broadcast_tensors([self, mask]) + source_idx = mask.reshape(-1).cumsum(0) - 1 + return inductor_prims.masked_scatter_with_index(self, mask, source_idx, source) + return NotImplemented + + +@register_decomposition(quantized_decomposed.choose_qparams.tensor) +def choose_qparams_tensor( + input: torch.Tensor, quant_min: int, quant_max: int, eps: float, dtype: torch.dtype +): + min_val, max_val = torch.aminmax(input) + scale = (max_val - min_val) / float(quant_max - quant_min) + scale = torch.max(scale, torch.Tensor([eps])) + zero_point = quant_min - torch.round(min_val / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + return scale.to(torch.float64), zero_point.to(torch.int64) + + +@register_decomposition(aten.put) +def put(self, index, source, accumulate=False): + flattened = self.flatten() + flattened = torch.index_put( + flattened, [index], source.reshape(index.shape), accumulate + ) + return flattened.reshape(self.shape) + + +@register_decomposition(aten.put_) +def put_(self, index, source, accumulate=False): + out = aten.put(self, index, source, accumulate=accumulate) + return self.copy_(out) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/test_operators.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/test_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..e8421722568cae2d438a34ebb4b4e24d98e3ce1f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/test_operators.py @@ -0,0 +1,24 @@ +import torch.library +from torch import Tensor +from torch.autograd import Function + +_test_lib_def = torch.library.Library("_inductor_test", "DEF") +_test_lib_def.define("realize(Tensor self) -> Tensor", tags=torch.Tag.pt2_compliant_tag) + +_test_lib_impl = torch.library.Library("_inductor_test", "IMPL") +for dispatch_key in ("CPU", "CUDA", "Meta"): + _test_lib_impl.impl("realize", lambda x: x.clone(), dispatch_key) + + +class Realize(Function): + @staticmethod + def forward(ctx, x): + return torch.ops._inductor_test.realize(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +def realize(x: Tensor) -> Tensor: + return Realize.apply(x) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/virtualized.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/virtualized.py new file mode 100644 index 0000000000000000000000000000000000000000..07c6ea8190a6188d0e8521f937bf1cde026b392c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/virtualized.py @@ -0,0 +1,351 @@ +""" +This file provides a number of "global" variables/handlers that are actually +thread local and dynamically scoped, with Inductor patching them to various +implementations depending on the situation. + +These handlers are interacted with in a fairly stylized way. Typically, +we will import V from this module:: + + from .virtualized import V + +Various handlers are accessible as attributes on this module; for example, +you might access ``V.graph.sizevars.size_hint`` to resolve a size hint associated with +a number. + +There are a few distinct usage patterns for virtualized global variables: + +1. Implicit argument passing. Examples: ``V.current_node``, ``V.aot_compilation``. + Use ``V.set_current_node`` to change what the current node is while we're + executing some region of code, so code inside that region can query ``V.current_node`` + to find out what it is. This is often more convenient than manually threading + the current node as an argument through all call stacks. + +2. Per-compilation global state. Examples: ``V.fake_mode``, ``V.graph``. For a + given ``compile_fx`` invocation, these typically don't change, but they are + associated with some internal state so they cannot just be global functions. + We install these objects at the beginning of compilation and then you can + conveniently access them without having to pass them around. + +3. Alternate define-by-run interpretations. Examples: ``V.ops``, ``V.kernel``. + A commonly used IR in Inductor is define-by-run: instead of maintaining + explicit syntax data structures, we instead represent loop bodies as + callable functions, which internally invoke operations defined on + ``V.ops``. To perform semantic analysis, print or code generate these + operations, we dynamically patch ``V.ops`` with an alternate handler with + the intended semantics and then run the callable function. For example, to + extract out a traditional (FX) graph representation of the define-by-run + IR, simply install a handler that records each ``ops`` call to a graph. + + TODO: Define a parent class / protocol that defines all of the operations + V.ops is expected to support. + +It is typically an error to access a virtualized global without having installed +an appropriate handler (you will get a NullHandler), although in some cases we +provide a default implementation. + +One last thing: although most virtualized globals are accessed via ``V``, ``ops`` is +ubiquitous enough to have its own top level variable, so you will typically see +``ops.constant(...)`` rather than ``V.ops.constant(...)``. In fact, these are not +equivalent; the former interface supports arithmetic overloads like ``x + y`` +instead of forcing ``ops.add(x, y)``, so it should be preferred. + +Some operators are seemingly unused, but they are implicitly used by ops_wrapper. +In particular, we typically have an operator for every basic pointwise PyTorch operation +supported. +""" + +from __future__ import annotations + +from contextlib import AbstractContextManager, contextmanager +from threading import local +from typing import Any, Callable, Generic, List, Type, TYPE_CHECKING, TypeVar, Union + +from .ops_handler import ( # noqa: F401 + KernelFormatterHandler, + MockHandler, + OpsHandler, + ReductionType, + StoreMode, + WrapperHandler, +) + +if TYPE_CHECKING: + import torch + from torch._inductor.debug import DebugContext + from torch._inductor.graph import GraphLowering + from torch._inductor.ir import InterpreterShim + from torch._subclasses import FakeTensorMode + +threadlocal = local() + +T = TypeVar("T") + + +class NullHandler: + """ + Sentinel indicating that a global variable is unset ala None. Typically, + attempting to access the global variable before it's set is an error, but with + NullHandler it won't fail until you try to access an attribute on it. + """ + + pass + + +class Virtualized(Generic[T]): + """ + Implements a global variable that redirects via thread local variable + (NB: construct this class to create the global variable; this is not + a singleton class!) + + This allows us to swap in different op implementations in codegen. + + NB: Despite the fact that we typically call these "handlers" (e.g., NullHandler is + the default value of the variable), we sometimes use these variables to + store other things, like booleans. + """ + + def __init__(self, vname: str, default: Union[Callable[[], T], Type[NullHandler]]): + self._key: str = f"__torchinductor_{vname}" + self._default = default + + def _set_handler(self, value: T) -> AbstractContextManager[None]: + prior = self._get_handler() + setattr(threadlocal, self._key, value) + + @contextmanager + def ctx(): + try: + yield + finally: + self._set_handler(prior) + + return ctx() + + def _get_handler(self) -> T: + try: + return getattr(threadlocal, self._key) + except AttributeError: + # TODO: To be honest, I feel we probably should just error in this + # case, instead of making a null handler that will probably error + # when you getattr on it + return self._default() # type: ignore[return-value] + + def __getattr__(self, name: str) -> Any: + return getattr(self._get_handler(), name) + + +class NullKernelHandler(NullHandler): + """ + We need access `V.kernel.removed_buffers` in DeferredLine class when there + is no kernel in the context. This happens when codegening the wrapper. + Initialize `removed_buffers` and `inplaced_to_remove` explicitly so we don't + need call 'getattr' with default value which is error prone to typo in + attribute name. + """ + + def __init__(self): + super().__init__() + self.removed_buffers = set() + self.inplaced_to_remove = set() + self.index_dtype = "tl.int64" + + +_ops: Virtualized[OpsHandler[Any]] = Virtualized("ops", MockHandler) +_graph: Virtualized[GraphLowering] = Virtualized("graph", NullHandler) +_real_inputs: Virtualized[List[torch.Tensor]] = Virtualized("real_inputs", NullHandler) +_fake_mode: Virtualized[FakeTensorMode] = Virtualized("fake_mode", NullHandler) +_kernel: Virtualized[NullKernelHandler] = Virtualized( + "kernel", NullKernelHandler +) # TODO: improve type +_debug: Virtualized[DebugContext] = Virtualized("debug", NullHandler) +_interpreter: Virtualized[InterpreterShim] = Virtualized("interpreter", NullHandler) +_aot_compilation: Virtualized[bool] = Virtualized("aot_compilation", NullHandler) +_current_node: Virtualized[torch.fx.Node] = Virtualized("current_node", NullHandler) + + +class OpsValue: + """The return type of most ops calls. + + This exists so we can overload magic methods, and write mathematical + expressions much more fluently. So instead of + + ops.add(ops.mul(ops.mul(ops.sub(ops.mul(_Ap2, x), _Ap3), x), x), _1) + + we can write + + (_Ap2 * x - _Ap3) * x * x + _1 + + """ + + value: Any + + def __init__(self, value): + self.value = value + + def __str__(self): + return str(self.value) + + def __repr__(self): + return f"OpsValue({self.value!r})" + + def __add__(self, other): + return ops.add(self, other) + + def __mul__(self, other): + return ops.mul(self, other) + + def __sub__(self, other): + return ops.sub(self, other) + + def __neg__(self): + return ops.neg(self) + + def __truediv__(self, other): + return ops.truediv(self, other) + + def __floordiv__(self, other): + return ops.floordiv(self, other) + + def __mod__(self, other): + return ops.mod(self, other) + + def __pow__(self, other): + return ops.pow(self, other) + + def __lt__(self, other): + return ops.lt(self, other) + + def __le__(self, other): + return ops.le(self, other) + + def __eq__(self, other): + return ops.eq(self, other) + + def __ne__(self, other): + return ops.ne(self, other) + + def __gt__(self, other): + return ops.gt(self, other) + + def __ge__(self, other): + return ops.ge(self, other) + + def __and__(self, other): + return ops.bitwise_and(self, other) + + def __or__(self, other): + return ops.bitwise_or(self, other) + + def __xor__(self, other): + return ops.bitwise_xor(self, other) + + def __invert__(self): + return ops.bitwise_not(self) + + def __rshfit__(self, n): + return ops.bitwise_right_shift(self, n) + + def __lshift__(self, n): + return ops.bitwise_left_shift(self, n) + + +class OpsWrapper: + """This wraps any returned IR values into an `OpsValue` instance, so that we + can overload the magic methods for writing mathematical expressions fluently. + """ + + def __getattr__(self, name): + def inner(*args, **kwargs): + new_args = [OpsWrapper._unwrap(a) for a in args] + new_kwargs = {k: OpsWrapper._unwrap(v) for k, v in kwargs.items()} + return OpsWrapper._wrap(getattr(_ops, name)(*new_args, **new_kwargs)) + + return inner + + @staticmethod + def _unwrap(x): + if isinstance(x, (list, tuple)): + return tuple(OpsWrapper._unwrap(v) for v in x) + if isinstance(x, OpsValue): + return x.value + return x + + @staticmethod + def _wrap(x): + if isinstance(x, (list, tuple)): + return tuple(OpsValue(v) for v in x) + return OpsValue(x) + + @staticmethod + def indirect_indexing(index, size, check=True): + # Returns a sympy value, not IR value + index = OpsWrapper._unwrap(index) + return _ops.indirect_indexing(index, size, check) + + +ops = OpsWrapper() + + +class _V: + MockHandler = MockHandler + KernelFormatterHandler = KernelFormatterHandler + WrapperHandler = WrapperHandler + + set_ops_handler: Callable[[Any], Any] = _ops._set_handler + get_ops_handler: Callable[[], Any] = _ops._get_handler + set_graph_handler: Callable[[GraphLowering], Any] = _graph._set_handler + set_real_inputs: Callable[[Any], Any] = _real_inputs._set_handler + get_real_inputs: Callable[[], Any] = _real_inputs._get_handler + set_fake_mode: Callable[[Any], Any] = _fake_mode._set_handler + get_fake_mode: Callable[[], Any] = _fake_mode._get_handler + set_kernel_handler: Callable[[Any], Any] = _kernel._set_handler + set_debug_handler: Callable[[Any], Any] = _debug._set_handler + set_interpreter_handler: Callable[[Any], Any] = _interpreter._set_handler + set_aot_compilation: Callable[[bool], Any] = _aot_compilation._set_handler + get_aot_compilation: Callable[[], Any] = _aot_compilation._get_handler + set_current_node: Callable[[Any], Any] = _current_node._set_handler + get_current_node: Callable[[], Any] = _current_node._get_handler + + @property + def ops(self) -> OpsHandler[Any]: + """The operator handler specific to the current codegen task""" + return _ops._get_handler() + + @property + def graph(self) -> GraphLowering: + """The graph currently being generated""" + return _graph._get_handler() + + @property + def real_inputs(self): + """non-fake example inputs""" + return _real_inputs._get_handler() + + @property + def fake_mode(self): + """The graph currently being generated""" + return _fake_mode._get_handler() + + @property + def kernel(self): + """The kernel currently being generated""" + return _kernel._get_handler() + + @property + def debug(self): + return _debug._get_handler() + + @property + def interpreter(self): + return _interpreter._get_handler() + + @property + def aot_compilation(self): + return _aot_compilation._get_handler() + + @property + def current_node(self): + return _current_node._get_handler() + + +V = _V() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h new file mode 100644 index 0000000000000000000000000000000000000000..dca6a39a0970e9a87457d500730fbe9ab6788933 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h @@ -0,0 +1,98 @@ +#pragma once + +#include +#include +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +struct TensorIteratorBase; +class TensorBase; +} + +namespace at::native { + +// These constants control the approximation behavior of gelu function. +enum class GeluType { + None, // Baseline Gelu + Tanh, // Tahn Gelu Approximation + END +}; + +static GeluType get_gelutype_enum(const c10::string_view approximate) { + if (approximate == "none") { + return GeluType::None; + } else if (approximate == "tanh") { + return GeluType::Tanh; + } else { + TORCH_CHECK(false, "approximate argument must be either none or tanh."); + } +} + +static std::string gelutype_to_string(const GeluType type) { + switch(type) { + case GeluType::None: return "none"; + case GeluType::Tanh: return "tanh"; + default: TORCH_CHECK(false, "unknown GELU type: ", static_cast(type)); + } +} + +using structured_activation_fn = void (*)(TensorIteratorBase&); +using structured_activation_backward_fn = void (*)(TensorIteratorBase&); + +using activation_fn = void (*)(TensorIterator&); +using activation_backward_fn = void (*)(TensorIterator&); +using softplus_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using softplus_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using threshold_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using hardtanh_backward_fn = void (*)(TensorIterator&, const c10::Scalar&, const c10::Scalar&); +using hardsigmoid_fn = void(*)(TensorIteratorBase&); +using hardsigmoid_backward_fn = void(*)(TensorIteratorBase&); +using hardswish_fn = void(*)(TensorIterator&); +using hardswish_backward_fn = void(*)(TensorIterator&); +using shrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using softshrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using shrink_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using elu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&); +using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&, bool); +using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&); +using gelu_fn = void (*)(TensorIteratorBase&, GeluType); +using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType); +using glu_jvp_fn = void (*)(TensorIteratorBase&); + +DECLARE_DISPATCH(elu_fn, elu_stub); +DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub); +DECLARE_DISPATCH(softplus_fn, softplus_stub); +DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub); +DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub); +DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub); +DECLARE_DISPATCH(threshold_fn, threshold_stub); +DECLARE_DISPATCH(gelu_fn, GeluKernel); +DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel); +DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub); +DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub); +DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub); +DECLARE_DISPATCH(hardswish_fn, hardswish_stub); +DECLARE_DISPATCH(hardswish_backward_fn, hardswish_backward_stub); +DECLARE_DISPATCH(shrink_fn, hardshrink_stub); +DECLARE_DISPATCH(softshrink_fn, softshrink_stub); +DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub); +DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub); +DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub); +DECLARE_DISPATCH(structured_activation_fn, glu_stub); +DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub); +DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub); +DECLARE_DISPATCH(structured_activation_fn, silu_stub); +DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub); +DECLARE_DISPATCH(structured_activation_fn, mish_stub); +DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub); +DECLARE_DISPATCH(activation_fn, prelu_stub); +DECLARE_DISPATCH(activation_backward_fn, prelu_backward_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h new file mode 100644 index 0000000000000000000000000000000000000000..d342d218e449ae0af1f6933312d7ba3b2362a00d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native { + +using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel); + +using adaptive_max_pooling_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling_fn, adaptive_max_pool2d_kernel); +DECLARE_DISPATCH(adaptive_max_pooling_backward_fn, adaptive_max_pool2d_backward_kernel); + +static inline int64_t start_index(int64_t a, int64_t b, int64_t c) { + return (a / b) * c + ((a % b) * c) / b; +} + +static inline int64_t end_index(int64_t a, int64_t b, int64_t c) { + return 1 + ((a + 1) * c - 1) / b; +} + +static inline void adaptive_pool_empty_output_check(const Tensor& gradOutput_, const char* arg_name) { + int64_t ndim = gradOutput_.ndimension(); + for (const auto i : c10::irange(1, ndim)) { + TORCH_CHECK(gradOutput_.size(i) > 0, + arg_name, "(): Expected grad_output to have non-zero size for non-batch dimensions, " + "but grad_output has sizes ", gradOutput_.sizes(), " with dimension ", i, + " being empty"); + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..59d459bd9c29edf165bc396d80c0ad7b5847c4aa --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +// original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to +// the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not +// match, will change them to be a common super type so comparisons are done between the same types. +// For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the +// corresponding raw_* version should be used since it was already contiguous of the right type. +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + Tensor& trimmed_sorter, + const Tensor& raw_input, + const Tensor& raw_boundaries, + const Tensor& raw_sorter) { + bool in_is_contiguous = raw_input.is_contiguous(); + bool bd_is_contiguous = raw_boundaries.is_contiguous(); + bool sort_is_contiguous = raw_sorter.is_contiguous(); + + if (!in_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value " + "tensor if possible. This message will only appear once per program."); + trimmed_input = raw_input.contiguous(); + } + if (!bd_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary " + "tensor if possible. This message will only appear once per program."); + trimmed_boundaries = raw_boundaries.contiguous(); + } + if (!sort_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter " + "tensor if possible. This message will only appear once per program."); + trimmed_sorter = raw_sorter.contiguous(); + } + if (raw_input.dtype() != raw_boundaries.dtype()) { + at::native::ResultTypeState state = {}; + state = at::native::update_result_type_state(raw_boundaries, state); + state = at::native::update_result_type_state(raw_input, state); + ScalarType common_stype = at::native::result_type(state); + + TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined); + if (common_stype != raw_input.scalar_type()) { + trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype); + } + if (common_stype != raw_boundaries.scalar_type()) { + trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype); + } + } +} + +/* unused but needed for internal jagged tensor class */ +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + const Tensor& raw_input, + const Tensor& raw_boundaries) { + Tensor trimmed_sorter; + Tensor raw_sorter; + return searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + raw_input, + raw_boundaries, + raw_sorter); +} + +inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) { + if (boundaries.dim() != input.dim()) { + return false; + } + const auto& dims_bd = boundaries.sizes(); + const auto& dims_in = input.sizes(); + for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) { + if (dims_bd[dim] != dims_in[dim]) { + return false; + } + } + return true; +} + +inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) { + auto tensor = c10::scalar_to_tensor(scalar, device); + // This is to adopt the scalar promotion rules defined in native/TypeProperties.h + // So we have the same type promotion rules as binary operations. + tensor.unsafeGetTensorImpl()->set_wrapped_number(true); + return tensor; +} + +inline void searchsorted_pre_check( + const Tensor& boundaries, + const Tensor& input, + const Tensor& output, + const bool out_int32, + const bool right, + const c10::optional side_opt, + const Tensor& sorter) { + if (side_opt) { + const c10::string_view side = *side_opt; + TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ", + "got ", side); + + // assume the user has not explicitly set (right=False, side="right") + TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side " + "of ", side, " while right was True"); + } + + TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ", + "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ", + "tensor device type ", input.device()); + + if (sorter.defined()) { + TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ", + "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ", + "device type ", boundaries.device()); + + TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same " + "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes()); + + TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ", + "dtype but got dtype ", sorter.scalar_type()); + + if (sorter.numel() > 0) { + auto minmax = sorter.aminmax(); + int64_t vmin = std::get<0>(minmax).item().toLong(); + int64_t vmax = std::get<1>(minmax).item().toLong(); + TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range"); + } + } + + TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1), + "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ", + "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(", + input.numel(), ")"); + + TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ", + "got 0 dimension"); + + TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input), + "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ", + "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ", + input.sizes()); + + ScalarType output_dtype = output.scalar_type(); + TORCH_CHECK( + (output_dtype == ScalarType::Long && !out_int32) || + (output_dtype == ScalarType::Int && out_int32), + "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ", + "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype, + " and out_int32 flag is ", (out_int32 ? "True" : "False")); + + if (out_int32) { + TORCH_CHECK(boundaries.sizes().back() < INT_MAX, + "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ", + boundaries.sizes().back()); + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2691b9761ee21294a03351be629d1bd139a87b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h @@ -0,0 +1,446 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +using conv_depthwise2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub); +using conv_depthwise3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub); +using cudnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub); +using mps_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub); +using cudnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub); +using miopen_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub); +using miopen_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub); +using miopen_depthwise_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub); +using mkldnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub); +using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional&, + IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); +DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub); +using mkldnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub); +using slow_conv_dilated2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub); +using slow_conv_dilated3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub); +using slow_conv_transpose2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub); +using slow_conv_transpose3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub); + +namespace { + static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true; +} + +static inline bool cudnnv8_enabled_check_debug() { + static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true; + static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true; + static uint8_t cudnnv8_debugcount = 0; + if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) { + TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b); + cudnnv8_debugcount++; + } + return cudnnv8_flag == 1; +} + +static inline bool cudnnv8_use_heur_mode_b() { + return cudnnv8_heuristic_mode_b; +} + +// Keep in sync with py::enum_ in Module.cpp +enum class ConvBackend { + CudaDepthwise2d, + CudaDepthwise3d, + Cudnn, + CudnnTranspose, + Empty, + Miopen, + MiopenDepthwise, + MiopenTranspose, + Mkldnn, + MkldnnTranspose, + MkldnnEmpty, + NnpackSpatial, + Overrideable, + Slow2d, + Slow3d, + SlowDilated2d, + SlowDilated3d, + SlowTranspose2d, + SlowTranspose3d, + Winograd3x3Depthwise, + Xnnpack2d, + Mps, + MpsTranspose, +}; + +// Overload for selecting the convolution backend from the full set of convolution inputs. +// This overload is exposed to python for testing, etc. +TORCH_API ConvBackend select_conv_backend( + const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, + bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); + +TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, + const Tensor& weight, + const ConvBackend backend); + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + AT_ERROR(ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) +static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. + +template +static inline std::vector _conv_output_size( + ArrayRef input_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef stride, ArrayRef dilation = ArrayRef() +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + bool has_dilation = !dilation.empty(); + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (const auto d : c10::irange(2, dim)) { + auto dilation_ = has_dilation ? dilation[d - 2] : 1; + auto kernel = dilation_ * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +static inline std::vector conv_output_size( + IntArrayRef input_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +static inline std::vector conv_output_size( + SymIntArrayRef input_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +template +std::vector _conv_input_size( + ArrayRef output_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef output_padding, ArrayRef stride, ArrayRef dilation, T groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +static inline std::vector conv_input_size( + SymIntArrayRef output_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); +} + +static inline std::vector conv_input_size( + IntArrayRef output_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); +} + +template +std::vector _conv_weight_size( + ArrayRef input_size, ArrayRef output_size, + ArrayRef padding, ArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + padding[d - 2] * 2 - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +static inline std::vector conv_weight_size( + SymIntArrayRef input_size, SymIntArrayRef output_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +static inline std::vector conv_weight_size( + IntArrayRef input_size, IntArrayRef output_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) { + std::vector shape(dim, 1); + shape[1] = -1; + return bias.reshape(shape); +} + +static inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithCuDNN() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return at::MemoryFormat::Contiguous; + } + long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); + + bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); + if (can_use_cudnn_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } + + bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); + if (can_use_cudnn_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} + +// controls whether emptyCache will be called following cudnn conv benchmarking +TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); +TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); + + +static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithMIOpen() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + bool can_use_miopen_channels_last_2d = false; +#if defined(USE_ROCM) && (ROCM_VERSION >= 40300) + // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen + // See #64427 + static c10::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && ( + ( (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) ) + ); +#endif + + bool can_use_miopen_channels_last_3d = false; + + return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +} + +static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + // disable NHWC for MkldnnCPU tensor. + if (input.is_mkldnn() || weight.is_mkldnn()) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_mkldnn_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + bool can_use_mkldnn_channels_last_3d = + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d); + + return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; +} + +static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || ( + weight_memory_format == at::MemoryFormat::ChannelsLast)); + + return can_use_thnn_channels_last_2d; +} + +static inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for xpu tensor. + if (!input.is_xpu() || !weight.is_xpu()) { + return false; + } + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_xpu_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + bool can_use_xpu_channels_last_3d = + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d); + + return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h new file mode 100644 index 0000000000000000000000000000000000000000..9daee7f2d6c43586630ad38ac731b22ce2416ff3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace at { +class Tensor; + +namespace native { + +using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d); + +DECLARE_DISPATCH(cross_fn, cross_stub); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..a5ed9526c270dcedd93aba16ab28854d94b584fb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h @@ -0,0 +1,394 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native::templates { + +// ==================================================== Random ======================================================== + +// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`. +// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t). +// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance: +// +// auto actual = torch::empty({3, 3}, torch::half); +// actual.random_(0, 65504); +// +// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504 +// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to` +// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to +// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous +// available number for torch::half dtype. +template +int64_t update_from(int64_t from) { + static_assert( + std::is_floating_point::value || + std::is_same::value || + std::is_same::value, "scalar_t must be floating-point type"); + const auto from_plus_1 = static_cast(static_cast(from + 1)); + if (from_plus_1 < from) { + int64_t from_ = std::abs(from + 1); + int n = 0; + while (from_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + from = from_plus_1 + (1LL << (n - std::numeric_limits::digits + 1)); + } + return from; +} + +template +int64_t update_to(int64_t to) { + static_assert( + std::is_floating_point::value || + std::is_same::value || + std::is_same::value, "scalar_t must be floating-point type"); + const auto to_minus_1 = static_cast(static_cast(to - 1)); + if (to_minus_1 >= to) { + int64_t to_ = std::abs(to - 1); + int n = 0; + while (to_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + to = to_minus_1 - (1LL << (n - std::numeric_limits::digits + 1)); + } + return to; +} + +// Return earlier for not invoking kernel. +// See https://github.com/pytorch/pytorch/issues/103418 for more details +#define CHECK_EMPTY_AND_RETURN(tensor) \ + if (tensor.numel() == 0) { \ + return tensor; \ + } + +template class random_kernel, typename RNG> +at::Tensor& random_impl(at::Tensor& self, c10::optional generator) { + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + random_kernel()(iter, generator); + return self; +} + +#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \ + TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \ + +#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \ + if (var < -(1LL << digits) || var > (1LL << digits)) { \ + TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \ + "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \ + "This warning will become an error in version 1.7 release, please fix the code in advance"); \ + } + +static void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) { + const auto scalar_type = typeMetaToScalarType(dtype); + if (isFloatingType(scalar_type)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + + constexpr auto digits = std::numeric_limits::digits; + WARN_OUT_OF_BOUNDS(from, "from", digits, dtype); + WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype); + }); + } else if (scalar_type == kUInt64) { + // When you do a comparison between int64_t and uint64_t, the usual + // arithmetic conversions say that the int64_t value is promoted to + // unsigned. But this conversion wraps around: if I had -1 as my int64_t, + // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never + // the right thing to do. + CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype); + } else if (isIntegralType(scalar_type, /*includeBool=*/true)) { + AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool); + } else { + TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types"); + } +} + +template class random_from_to_kernel, typename RNG> +at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional to_opt, c10::optional generator) { + uint64_t range = 0; + auto iter = at::TensorIterator::borrowing_nullary_op(self); + if (to_opt.has_value()) { + // [from, to) + int64_t to = *to_opt; + TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to); + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] { + from = update_from(from); + to = update_to(to); + TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to); + }); + } + check_from_to_in_range(from, to - 1, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to) - static_cast(from); + random_from_to_kernel()(iter, range, from, generator); + } else if (from != std::numeric_limits::lowest()) { + // [from, std::numeric_limits::max()] + int64_t to_inc = 0; + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] { + constexpr int64_t scalar_t_max = static_cast(1) << std::numeric_limits::digits; + to_inc = scalar_t_max > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(scalar_t_max); + from = update_from(from); + TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc); + }); + } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) { + AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] { + if constexpr (std::is_same_v) { + to_inc = static_cast(true); + } else { + to_inc = static_cast(std::numeric_limits::max()); + } + }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool); + } else { + TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types"); + } + check_from_to_in_range(from, to_inc, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to_inc) - static_cast(from) + 1; + random_from_to_kernel()(iter, range, from, generator); + } else { + // [std::numeric_limits::lowest(), std::numeric_limits::max()] + // range = 2^64 + CHECK_EMPTY_AND_RETURN(self); + random_from_to_kernel()(iter, generator); + } + return self; +} + +// ==================================================== Normal ======================================================== + +#define CHECK_NORMAL_TENSOR_STD(std) \ + do { \ + TORCH_CHECK( \ + !std.is_complex(), \ + "normal expects standard deviation to be non-complex"); \ + TORCH_CHECK( \ + std.numel() == 0 || std.is_meta() || std.min().ge(0).item(), \ + "normal expects all elements of std >= 0.0"); \ + } while (0) + +#define CHECK_NORMAL_STD(std) \ + TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std); + +template class normal_kernel, typename RNG> +Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); + CHECK_EMPTY_AND_RETURN(self); + + if (self.is_complex()) { + auto float_tensor = at::view_as_real(self); + // variance for normal distribution of the real and imaginary values + // is half of the input variance + normal_kernel()(float_tensor, mean, std/(std::sqrt(2)), gen); + } else { + normal_kernel()(self, mean, std, gen); + } + return self; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); + auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); + auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, std, gen); + output.add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto mean_tensor = at::full({}, mean, output.options()); + auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean_tensor, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std + output.mul_(std).add_(mean_tensor); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean + mean * std instead of mean + output * std + output.mul_(std).add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, double std, c10::optional gen) { + CHECK_NORMAL_STD(std); + Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(double mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +// ==================================================== Uniform ======================================================= + +template class uniform_kernel, typename RNG> +at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional generator) { + if (self.is_complex()) { + CHECK_EMPTY_AND_RETURN(self); + auto float_tensor = at::view_as_real(self); + uniform_impl_(float_tensor, from, to, generator); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] { + const auto dtype = self.dtype(); + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype); + TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + from = std::min(std::max(from, min), max); + to = std::max(std::min(to, max), min); + }); + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + uniform_kernel()(iter, from, to, generator); + } + return self; +} + +// ================================================== LogNormal ======================================================= + +template class log_normal_kernel, typename RNG> +at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional gen) { + TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + log_normal_kernel()(iter, mean, std, gen); + return self; +} + +// =================================================== Geometric ====================================================== + +template class geometric_kernel, typename RNG> +Tensor& geometric_impl_(Tensor& self, double p, c10::optional gen) { + TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + geometric_kernel()(iter, p, gen); + return self; +} + +// ================================================== Exponential ===================================================== + +template class exponential_kernel, typename RNG> +Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional gen) { + TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + exponential_kernel()(iter, lambda, gen); + return self; +} + +// ==================================================== Cauchy ======================================================== + +template class cauchy_kernel, typename RNG> +Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional gen) { + // TODO: instead of variable name 'sigma', use 'gamma' or 'scale' + // the variance, squared sigma, is undefined for cauchy distribution + TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + cauchy_kernel()(iter, median, sigma, gen); + return self; +} + +// ==================================================== Bernoulli ===================================================== + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional gen) { + CHECK_EMPTY_AND_RETURN(self); + NoNamesGuard guard; + at::assert_no_internal_overlap(self); + bernoulli_tensor_kernel()(self, p_, gen); + return self; +} + +template class bernoulli_scalar_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional gen) { + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + at::assert_no_internal_overlap(self); + bernoulli_scalar_kernel()(self, p, gen); + return self; +} + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional gen) { + // result.resize_as_(self) requires self to have same dtype as result, so we + // use resize_ instead. + // TODO: Fix resize_as_. See pytorch/pytorch#11665. + result.resize_(self.sizes()); + bernoulli_impl_(result, self, gen); + namedinference::propagate_names(result, self); + return result; +} + +#undef CHECK_OUT_OF_BOUNDS +#undef WARN_OUT_OF_BOUNDS + +} // namespace at::native::templates diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h new file mode 100644 index 0000000000000000000000000000000000000000..cd19fa4691ad04ea0b6e3aeea2cafe9eca907dd4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at::native { + +using histogramdd_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&); +using histogramdd_linear_fn = void(*)(const Tensor&, const c10::optional&, bool, Tensor&, const TensorList&, bool); +using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges); + +DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub); +DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub); +DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8b0a787f0e87a64bced49d9c920ebd66262bd26a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h @@ -0,0 +1,41 @@ +#pragma once +#include +#include + +namespace at { +class Tensor; +class TensorBase; +struct TensorIterator; +struct TensorIteratorBase; +} + +namespace c10 { +class Scalar; +} + +namespace at::native { + +using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); +using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source); +using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride); +using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); +using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate); +using take_fn = void(*)(TensorIterator & iter, const TensorBase& input); +using flip_fn = void(*)(TensorIterator &, const bool); +using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar); +using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); +using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &); + +DECLARE_DISPATCH(index_fn, index_stub); +DECLARE_DISPATCH(index_fill_fn, index_fill_stub); +DECLARE_DISPATCH(index_copy_fn, index_copy_stub); +DECLARE_DISPATCH(index_put_fn, index_put_stub); +DECLARE_DISPATCH(put_fn, put_stub); +DECLARE_DISPATCH(take_fn, take_stub); +DECLARE_DISPATCH(flip_fn, flip_stub); +DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub); +DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub); +DECLARE_DISPATCH(masked_select_fn, masked_select_stub); +DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..72b39eb326a0c1b6dafc11e047bd62c8420e2cd0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h @@ -0,0 +1,160 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +[[noreturn]] +static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) { + TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx, + " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx); +} + + +static C10_UNUSED std::vector expandTensors(const Tensor & self, IOptTensorListRef indices) { + // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors + std::vector result; + for (const auto& index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + const auto& index = *index_opt; + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); + } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = static_cast(result.size() + j); + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); + for (const auto j : c10::irange(index.dim())) { + result.emplace_back(nonzero.select(1, j)); + } + } else { + result.emplace_back(index); + } + } + } + return result; +} + +static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) { + for (const auto& tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); + if (allow_int) { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors"); + } + } else { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); + } + } + } + } +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const IValue& a : list) { + result.push_back(a.isTensor() ? c10::optional(a.toTensor()) : c10::optional()); + } + return result; +} + +static C10_UNUSED bool hasContiguousSubspace(TensorList tl) { + // true if all the non-null tensors are adjacent + auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; + auto isNull = [](const Tensor & tensor){ return !tensor.defined(); }; + auto start = std::find_if(tl.begin(), tl.end(), isDefined); + auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined); + auto it = std::find_if(start, stop.base(), isNull); + return it == stop.base(); +} + + +// Transposes the tensor and indices together so that all the non-null indices +// index the first k dimensions of the tensor. Returns the transposed tensor +// and the reordered indices. For example: +// transposeToFront(tensor, {nullptr, a, nullptr, b}) +// returns +// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr} +static C10_UNUSED std::tuple> +transposeToFront(const Tensor& self, TensorList indices) { + std::vector dims; + std::vector transposedIndices; + dims.reserve(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices)); +} + +inline std::tuple, std::vector> +transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) { + std::vector dims; + std::vector invPerm; + std::vector transposedIndices; + dims.reserve(self.dim()); + invPerm.resize(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + for (const auto i : c10::irange(self.dim())) { + invPerm[dims[i]] = i; + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm)); +} + +struct AdvancedIndex { + AdvancedIndex(const Tensor& src, TensorList indices); + + Tensor src; + std::vector indices; + DimVector indexed_sizes; + DimVector indexed_strides; + int64_t dims_before; + int64_t dims_after; +}; + + +} //namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..584d07aeca358b2164f97d705dbaf172868f8a94 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include + +#include +#endif + +namespace at::native { +// This fallback should only be used for operations that are self inverse and have a corresponding tensor +// bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit. +// Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit. +// Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called. + +// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. +struct MathOpFallback { + MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {} + virtual bool is_bit_set(const Tensor&) = 0; + void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + /* + Situations to handle: + 1. Out-of-place operation. Easy: materialize all inputs and + call it a day. + 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). + Materialize other inputs as in (1). + 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) + Materialize other inputs as in (1). + + It is important to be able to tell if we READ from an argument and if we + WRITE to an argument. Conservative approach is to assume that we always + READ from an argument, but in out= operations you can skip + conjugating inputs on entry that never get used. In the current schema we + can't easily tell if the operation is in in-place or out= operation. + + Note: + 1. Mutable tensorlists containing tensors whose math bit set to true are disallowed. + 2. Mutable tensors with math bit set to true are unconditionally cloned to ensure + correct behavior in the case when the mutable tensor shares memory with non mutable arguments. + + If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory + with these mutable inputs would read into wrong values in the following cases: + 1. Non mutable inputs have their math bit set to false. + 2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory + with one or more mutable arg(s)) are cloned. + At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs. + */ + const auto& arguments = op.schema().arguments(); + const auto num_arguments = arguments.size(); + const auto stack_start = stack->size() - num_arguments; + + c10::optional is_write; + for (const auto i : c10::irange(num_arguments)) { + // Three possible states: + // 1. alias_info has no value --> out-of-place operation + // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation + // 3. alias_info does have a value, alias_info->is_write=False --> view operation + const AliasInfo* alias_info = arguments[i].alias_info(); + if (alias_info != nullptr) { + if (is_write.has_value()) { + TORCH_CHECK(*is_write == alias_info->isWrite(), + "Unsupported operator for ", op_name, " fallback: ", op.schema().name(), + op_name, " fallback doesn't work for operators with a mix " + "mutable and non-mutable inputs that alias with outputs, " + "this must be implemented manually. " + "If you got this error on a core op, please report a bug to PyTorch."); + } else { + is_write = alias_info->isWrite(); + } + } + } + + if (is_write.has_value() && !*is_write) { + // We assume that view operators automatically handle the math bit + // correctly by propagating the dispatch key in key_set. + // This is not necessarily always right, so you should test these cases. + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + return; + } + + // Mutable inputs with math bit set to True and their clones + std::vector> mutable_inputs_with_their_clones; + for (const auto i : c10::irange(num_arguments)) { + auto& ivalue = (*stack)[stack_start + i]; + if (!(ivalue.isTensor() || ivalue.isTensorList())) { + continue; + } + const auto& argument = arguments[i]; + bool mut_arg = false; + if (argument.alias_info()) { + // Was already tested by is_write loop above + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite()); + mut_arg = true; + } + if (ivalue.isTensor()) { + if (!is_bit_set(ivalue.toTensor())) { + continue; + } + auto tensor = std::move(ivalue).toTensor(); + auto resolved_tensor = at::clone(tensor); + if (mut_arg) { + TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ", + op_name, "bit set to true."); + mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor); + } + (*stack)[stack_start + i] = std::move(resolved_tensor); + } else if (ivalue.isTensorList()) { + auto tensors = std::move(ivalue).toTensorList(); + for(const auto j : c10::irange(tensors.size())) { + const auto& tensor = tensors[j]; + if (!is_bit_set(tensor)) { + continue; + } + TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ", + op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ", + op.schema().name()); + tensors[j] = at::clone(tensor); + } + (*stack)[stack_start + i] = std::move(tensors); + } + } + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + + TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1); + + for (std::pair mut_tensors: mutable_inputs_with_their_clones) { + auto& mutable_input = mut_tensors.first; + auto& cloned_mutable_input = mut_tensors.second; + auto& ivalue = (*stack)[stack_start]; + auto returned_output = std::move(ivalue).toTensor(); + + // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input + TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output)); + + // necessary for out= arg + at::native::resize_output(mutable_input, returned_output.sizes()); + + mutable_input.copy_(returned_output); + (*stack)[stack_start] = std::move(mutable_input); + } + } + + virtual ~MathOpFallback() = default; + + DispatchKey key; + string op_name; +}; + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h new file mode 100644 index 0000000000000000000000000000000000000000..3c6760ca6886679c25b13aa97706def39c25382d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +static void check_max_pool1d( + const Tensor& self, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + TORCH_CHECK( + self.dim() == 2 || self.dim() == 3, + "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes()); + TORCH_CHECK( + kernel_size.size() == 1, + "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ", + kernel_size.size()); + TORCH_CHECK( + stride.empty() || stride.size() == 1, + "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ", + stride.size()); + TORCH_CHECK( + padding.size() == 1, + "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ", + padding.size()); + TORCH_CHECK( + dilation.size() == 1, + "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ", + dilation.size()); + + // If stride=None then set it to kernel_size + if (stride.empty()) { + stride = kernel_size; + } + + TORCH_CHECK( + kernel_size[0] > 0, + "max_pool1d() kernel_size must be greater than zero, but got ", + kernel_size[0]); + TORCH_CHECK( + stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]); + TORCH_CHECK( + padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]); + TORCH_CHECK( + padding[0] <= kernel_size[0] / 2, + "max_pool1d() padding should be at most half of kernel size, but got padding=", + padding[0], + " and kernel_size=", + kernel_size[0]); + TORCH_CHECK( + dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]); + + const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode); + TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW); +} + +// TODO(Heitor) Template by dimension +struct PoolingParams1D { + int64_t NB; // Number of batches + int64_t NC; // Number of channels + int64_t IW; // Input width + int64_t OW; // Output width + int64_t KW; // Kernel width + int64_t SJ; // Column stride + int64_t PJ; // Column padding + int64_t DJ; // Column dilation + + // Return index of input element for the given kernel and output index + inline int64_t index(int64_t kj, int64_t oj) const { + return oj * SJ + kj * DJ - PJ; + } + + // Return index of first output within bounds for this kernel index + inline int64_t valid_output_start(int64_t kj) const { + int64_t ij = index(kj, 0);; + return ij < 0 ? at::divup(-ij, SJ) : 0; + } + + // Return index one past last output within bounds for this kernel index + inline int64_t valid_output_end(int64_t kj) const { + int64_t ij = index(kj, OW - 1); + return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW; + } +}; + +using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&); + +DECLARE_DISPATCH(pooling_fn, max_pool1d_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3d18bc5e1525bacaf27d97a86024540236ce6220 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h @@ -0,0 +1,27 @@ +#include +#include +#include + +namespace at::native { + +inline int64_t ensure_nonempty_dim(int64_t dim) { + return std::max(dim, 1); +} + +inline int64_t ensure_nonempty_size(const TensorBase &t, int64_t dim) { + return t.dim() == 0 ? 1 : t.size(dim); +} + +inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) { + return t.dim() == 0 ? 1 : t.stride(dim); +} + +using IdxVec = std::vector; +inline IdxVec ensure_nonempty_vec(IdxVec vec) { + if (vec.empty()) { + vec.push_back(1); + } + return vec; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Padding.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Padding.h new file mode 100644 index 0000000000000000000000000000000000000000..0834361342823576f1259f7ab4910761d70fe7bf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Padding.h @@ -0,0 +1,62 @@ +#pragma once + +#include +#include + +namespace at::native { + +using padding_fn = void (*)(const Tensor&, const Tensor&, IntArrayRef); + +// reflection padding +DECLARE_DISPATCH(padding_fn, reflection_pad1d_kernel); +DECLARE_DISPATCH(padding_fn, reflection_pad1d_backward_kernel); +DECLARE_DISPATCH(padding_fn, reflection_pad2d_kernel); +DECLARE_DISPATCH(padding_fn, reflection_pad2d_backward_kernel); +DECLARE_DISPATCH(padding_fn, reflection_pad3d_kernel); +DECLARE_DISPATCH(padding_fn, reflection_pad3d_backward_kernel); + +// replication padding +DECLARE_DISPATCH(padding_fn, replication_pad1d_kernel); +DECLARE_DISPATCH(padding_fn, replication_pad1d_backward_kernel); +DECLARE_DISPATCH(padding_fn, replication_pad2d_kernel); +DECLARE_DISPATCH(padding_fn, replication_pad2d_backward_kernel); +DECLARE_DISPATCH(padding_fn, replication_pad3d_kernel); +DECLARE_DISPATCH(padding_fn, replication_pad3d_backward_kernel); + +namespace padding { + +template +static inline void check_valid_input(const Tensor& input, IntArrayRef padding) { + + TORCH_CHECK(padding.size() == 2 * dim, + "padding size is expected to be ", 2 * dim, + ", but got: ", padding.size()); + + int input_dim = input.dim(); + + bool is_batch_mode = input_dim == (dim + 2); + + bool valid_batch_mode = is_batch_mode; + bool valid_non_batch_mode = !is_batch_mode; + + if (is_batch_mode) { + // allow batch size of 0-dim. + for (const auto d : c10::irange(1, input_dim)) { + valid_batch_mode = valid_batch_mode && input.size(d) != 0; + } + } else { + for (const auto d : c10::irange(0, input_dim)) { + valid_non_batch_mode = valid_non_batch_mode && input.size(d) != 0; + } + } + + // allow empty batch size but not other dimensions. + TORCH_CHECK(valid_batch_mode || valid_non_batch_mode, + "Expected ", dim + 1, "D or ", dim + 2, + "D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); +} + +} // namespace padding + +} // at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/PointwiseOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/PointwiseOps.h new file mode 100644 index 0000000000000000000000000000000000000000..d2e2d44db2af1e74eda272afa6fa80729ab2a2db --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/PointwiseOps.h @@ -0,0 +1,28 @@ +// Ternary and higher-order pointwise operations +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { + +struct TensorIterator; +struct TensorIteratorBase; + +namespace native { + +using pointwise_fn = void (*)(TensorIterator&, const Scalar& scalar); +using structured_pointwise_fn = void (*)(TensorIteratorBase&, const Scalar& scalar); +using pointwise_fn_double = void (*)(TensorIterator&, const Scalar&, double); + +DECLARE_DISPATCH(structured_pointwise_fn, addcmul_stub); +DECLARE_DISPATCH(structured_pointwise_fn, addcdiv_stub); +DECLARE_DISPATCH(pointwise_fn_double, smooth_l1_backward_stub); +DECLARE_DISPATCH(pointwise_fn_double, huber_backward_stub); +DECLARE_DISPATCH(pointwise_fn, mse_backward_stub); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pool.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pool.h new file mode 100644 index 0000000000000000000000000000000000000000..471c9532807e8e926458e7227cfeca2544b301f8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pool.h @@ -0,0 +1,340 @@ +#include +#include +#include +#include +#include + +#include + +#pragma once + +namespace at::native { + +using max_pool2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, + int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH); +using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); + +DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel); +DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel); + +// averge pooling has same signature for forward and backward +using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, + int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional divisor_override); +using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, + int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional divisor_override); + +DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); +DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); + +using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input, + int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD); +using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); + +DECLARE_DISPATCH(max_pool3d_fn, max_pool3d_kernel); +DECLARE_DISPATCH(max_pool3d_backward_fn, max_pool3d_backward_kernel); +namespace { + +template +static inline dest_t +safe_downcast(src_t v) +{ + TORCH_CHECK(std::numeric_limits::min() <= v && v <= std::numeric_limits::max(), + "integer out of range"); + + return static_cast(v); +} + +template +static inline T pooling_output_shape_pad_lr( + T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation, + bool ceil_mode) { + T outputSize = div_rtn( + inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 + + (ceil_mode ? stride - 1 : 0), stride) + 1; + if (ceil_mode) { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputSize - 1) * stride >= inputSize + pad_l) { + --outputSize; + } + } + return outputSize; +} + +template +static inline T pooling_output_shape( + T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) { + TORCH_CHECK(stride != 0, "stride should not be zero"); + TORCH_CHECK(pad >= 0, + "pad must be non-negative, but got pad: ", pad); + TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2, + "pad should be at most half of effective kernel size, but got pad=", + pad, ", kernel_size=", kernelSize, " and dilation=", dilation) + return pooling_output_shape_pad_lr( + inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode); +} + +template +std::pair _pooling_same_mode_padding_lr( + T inputSize, T kernelSize, T stride, T dilation) { + // NOTE: with strides, the output shape is ceil(inputSize/stride) + auto total_padding = T(dilation) * (kernelSize - 1); + + // Prefer symmetric padding if possible + if (stride > 2 && (total_padding % 2 == 1)) { + // The floor in the output size calculation gives us a little wiggle room + auto wiggle_room = inputSize % stride - 1; + if (wiggle_room > 0) { + total_padding = total_padding - 1; + } + } + + auto left = total_padding / 2; + return {left, total_padding - left}; +} + +inline std::pair pooling_same_mode_padding_lr( + int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) { + return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation); +} + +inline std::pair pooling_same_mode_padding_lr( + c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) { + return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation)); +} + +// AveragePool2d/DilatedMaxPool2d (forward) +static inline void +pool2d_shape_check( + const Tensor& input, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format) +{ + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kH: ", kH, " kW: ", kW); + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got " + "dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationH > 0 && dilationW > 0, + "dilation should be greater than zero, but got ", + "dilationH: ", dilationH, " dilationW: ", dilationW); + + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + if (memory_format == at::MemoryFormat::ChannelsLast){ + // Expect tensor in NHWC format and allow 0-dim only for N. + TORCH_CHECK((ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: ", input.sizes()); + } else { + TORCH_CHECK((ndim == 3 && input.size(0) != 0 && valid_dims) || + (ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:", + input.sizes()); + } + + TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, + "pad should be smaller than or equal to half of kernel size, but got ", + "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); + + TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, + "Given input size: (", + nInputPlane, "x", inputHeight, "x", inputWidth, "). ", + "Calculated output size: (", + nOutputPlane, "x", outputHeight, "x", outputWidth, "). ", + "Output size is too small"); +} + +// DilatedMaxPool2d (backward) +static inline void +max_pool2d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format) +{ + pool2d_shape_check( + input, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); + + check_dim_size(indices, ndim, ndim-3, nOutputPlane); + check_dim_size(indices, ndim, ndim-2, outputHeight); + check_dim_size(indices, ndim, ndim-1, outputWidth); +} + +// AveragePool2d (backward) +static inline void +avg_pool2d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + int64_t /*nbatch*/, + int kH, int kW, int dH, int dW, int padH, int padW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, + MemoryFormat memory_format) +{ + pool2d_shape_check( + input, + kH, kW, dH, dW, padH, padW, 1, 1, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + memory_format); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); +} + +// AveragePool3d/DilatedMaxPool3d (forward) +static inline void +pool3d_shape_check( + const Tensor& input, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int dilationT, int dilationH, int dilationW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char *fn_name, + bool check_input_size=false) +{ + const int64_t ndim = input.ndimension(); + + TORCH_CHECK(kT > 0 && kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kT: ", kT, " kH: ", kH, " kW: ", kW); + TORCH_CHECK(dT > 0 && dW > 0 && dH > 0, + "stride should be greater than zero, but got ", + "dT: ", dT, " dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationT > 0 && dilationW > 0 && dilationH > 0, + "dilation should be greater than zero, but got ", + "dilationT: ", dilationT, " dilationH: ", dilationH, " dilationW: ", dilationW); + + TORCH_CHECK(ndim == 4 || ndim == 5, + fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes()); + + for (const auto i : c10::irange(ndim)) { + if (ndim == 5 && i == 0) { + // size of batch-dim can be 0. + continue; + } + TORCH_CHECK( + input.size(i) > 0, + fn_name, + ": Expected input's non-batch dimensions to have positive length," + " but input has a shape of ", + input.sizes(), + " and non-batch dimension ", + input.size(i), + " has length zero!") + } + + if (check_input_size) { // AveragePool3d + TORCH_CHECK(itime >= kT && iheight >= kH && iwidth >= kW, + "input image ", "(T: ", itime, " H: ", iheight, " W: ", iwidth, ") smaller than ", + "kernel size ", "(kT: ", kT, " kH: ", kH, " kW: ", kW, ")"); + } + + TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, + "pad should be smaller than or equal to half of kernel size, but got " + "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); + + TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, + "Given input size: (", + nslices,"x", itime, "x", iheight, "x", iwidth, "). ", + "Calculated output size: (", + nslices, "x", otime, "x", oheight, "x", owidth, "). ", + "Output size is too small"); +} + +static inline void +max_pool3d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int dilationT, int dilationH, int dilationW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char* fn_name) +{ + const int64_t ndim = input.ndimension(); + + pool3d_shape_check( + input, + nslices, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW, + itime, iheight, iwidth, + otime, oheight, owidth, fn_name); + + check_dim_size(gradOutput, ndim, ndim-4, nslices); + check_dim_size(gradOutput, ndim, ndim-3, otime); + check_dim_size(gradOutput, ndim, ndim-2, oheight); + check_dim_size(gradOutput, ndim, ndim-1, owidth); + + check_dim_size(indices, ndim, ndim-4, nslices); + check_dim_size(indices, ndim, ndim-3, otime); + check_dim_size(indices, ndim, ndim-2, oheight); + check_dim_size(indices, ndim, ndim-1, owidth); +} + +static inline void +avg_pool3d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char *fn_name) +{ + const int64_t ndim = input.ndimension(); + + pool3d_shape_check( + input, + nslices, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + 1, 1, 1, + itime, iheight, iwidth, + otime, oheight, owidth, + fn_name, true); + + check_dim_size(gradOutput, ndim, ndim-4, nslices); + check_dim_size(gradOutput, ndim, ndim-3, otime); + check_dim_size(gradOutput, ndim, ndim-2, oheight); + check_dim_size(gradOutput, ndim, ndim-1, owidth); +} + +} // anonymous namespace + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RNN.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RNN.h new file mode 100644 index 0000000000000000000000000000000000000000..f3e54c2a40b425eb07fdecdb1164690bf78b4996 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RNN.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +namespace at::native { + +using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool); +using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool); +using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool); +using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool); + +DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub); +DECLARE_DISPATCH(lstm_fn, lstm_miopen_stub); +DECLARE_DISPATCH(lstm_fn, lstm_mkldnn_stub); +DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, gru_miopen_stub); +DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_tanh_miopen_stub); +DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub); +DECLARE_DISPATCH(rnn_fn, rnn_relu_miopen_stub); +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub); +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_miopen_stub); +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_miopen_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_miopen_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub); +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_miopen_stub); + +inline void check_attributes(const Tensor& input, const TensorList& params, const TensorList& hiddens, bool check_dtype=false) { + auto input_device = input.device(); + auto input_dtype = input.scalar_type(); + + auto check_tensors = [&](const std::string& name, const Tensor& t) { + if (!t.defined()) return; + auto t_device = t.device(); + TORCH_CHECK(input_device == t_device, + "Input and ", name, " tensors are not at the same device, found input tensor at ", + input_device, " and ", name, " tensor at ", t_device); + if (check_dtype) { + auto t_dtype = t.scalar_type(); + TORCH_CHECK(input_dtype == t_dtype, + "Input and ", name, " tensors are not the same dtype, found input tensor with ", + input_dtype, " and ", name, " tensor with ", t_dtype); + } + }; + + for (const auto& h : hiddens) check_tensors("hidden", h); + for (const auto& p : params) check_tensors("parameter", p); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h new file mode 100644 index 0000000000000000000000000000000000000000..a90ed815f935221ba03346c04e76c0f8830709e2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::native { + +template < + typename index_t, + void compute(index_t*, int64_t*, index_t*, int64_t, int64_t)> +static inline Tensor repeat_interleave_common( + const Tensor& repeats, + c10::optional output_size) { + TORCH_CHECK( + repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); + TORCH_CHECK( + repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt, + "repeats has to be Long or Int tensor"); + if (repeats.size(0) == 0) { + return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeats_ = repeats.contiguous(); + Tensor cumsum = repeats.cumsum(0); + int64_t total; + if (output_size.has_value()) { + total = output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK( + (repeats >= 0).all().item(), "repeats can not be negative"); + } + + Tensor result = at::empty({total}, repeats.options()); + index_t* repeat_ptr = repeats_.data_ptr(); + int64_t* cumsum_ptr = cumsum.data_ptr(); + index_t* result_ptr = result.data_ptr(); + compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total); + return result; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..0a1f21298957d3dcd2984211fc1bb24ace71e75c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h @@ -0,0 +1,173 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include + + +namespace at::native { + +// TODO: make all operations that resize given outputs use this function +// for consistency and maintainability. +// Some operations like `cat` might not be able to make the use of +// resize_output directly. For more details to understand how it works in `cat`, +// see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 +// Resizes outputs +// Functions accepting output tensors, like with the "out" kwarg, should +// call this function to handle resizing their output tensor. +// Issues a warning if the output tensor has one or more elements and +// needs resizing +// NOTE: In the future the warning will become an error +// Returns a bool saying whether or not the resize actually happened or not +TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape); +// WARNING: Do NOT call this directly. If you are resizing an output and want +// to support dynamic shapes call at::resize__symint and resize_output_check_symint. +// For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272 +TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape); + +// Utility for resize_output +// Returns a bool saying resize should happen or not and +// raises a warning if resizing for one or more elements +TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); +TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape); + +TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); +TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes); +TORCH_API void resize_bytes_nocuda(const Storage& storage, c10::SymInt size_bytes); + +static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in cuda/Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage& storage = self->unsafe_storage(); + if (!storage) { + auto new_storage = c10::make_intrusive( + StorageImpl::use_byte_size_t(), + new_size_bytes, + c10::GetCPUAllocator(), + true); + self->set_storage_keep_dtype(std::move(new_storage)); + } else if (new_size_bytes > storage.nbytes()) { + resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +TORCH_API TensorImpl* resize_impl_cpu_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride, + bool resize_storage = true); + +template +T maybe_convert_symint(c10::SymInt) = delete; + +template <> +inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; } + +template <> +inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); } + +template +static inline void checkInBoundsForStorage( + ArrayRef size, + ArrayRef stride, + T storage_offset, + const caffe2::TypeMeta& data_type, + const Storage& new_storage) { + T storage_size_bytes = + at::detail::computeStorageNbytes(size, stride, data_type.itemsize()); + T storage_offset_bytes = storage_offset * data_type.itemsize(); + if (storage_size_bytes == 0) { + // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. + return; + } + T new_storage_size_bytes = maybe_convert_symint(new_storage.sym_nbytes()); + TORCH_CHECK( + storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes, + "setStorage: sizes ", + size, + ", strides ", + stride, + "," + " storage offset ", + storage_offset, + ", and itemsize ", + data_type.itemsize(), + " requiring a storage size of ", + storage_size_bytes + storage_offset_bytes, + " are out of bounds for storage of size ", + new_storage_size_bytes); +} + +template +static inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset, + ArrayRef size, ArrayRef stride) { + // FIXME: stride should be optional + if (stride.data()) { + TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(), + ") and stride length (", stride.size(), ")"); + } + +#ifdef DEBUG + TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX"); +#endif + + // storage: note this can't be replaced with result.set_(storage) as the semantics of that + // function is to set the tensor size to be equal to the size of the storage. + if (!result.storage().is_alias_of(storage)) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + TORCH_INTERNAL_ASSERT(storage); + TORCH_INTERNAL_ASSERT(result.storage()); + + // We used to allow this, but this breaks device caching. + // Let's put an actual error message for this one. + TORCH_CHECK(result.storage().device() == storage.device(), + "Attempted to set the storage of a tensor on device \"", result.storage().device(), + "\" to a storage on different device \"", storage.device(), + "\". This is no longer allowed; the devices must match."); + result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage)); + } + + // storageOffset + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); +} + +/** + * Set self's sizes, strides, and storage_offset. + * (size, stride, storage_offset) must be in bounds for self's storage. + */ +template +inline void setStrided( + const Tensor& self, + ArrayRef size, + ArrayRef stride, + T storage_offset) { + TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); + for (const auto& val : stride) { + TORCH_CHECK(val >= 0, + "as_strided: Negative strides are not supported at the moment, " + "got strides: ", stride); + } + + auto* self_ = self.unsafeGetTensorImpl(); + checkInBoundsForStorage( + size, stride, storage_offset, self_->dtype(), self_->storage()); + + /* storage offset */ + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); + self_->set_sizes_and_strides(size, stride, c10::make_optional(storage_offset)); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..02d1e95c42efe80892f0b1032248699d8c4e7d6d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +template +inline T storage_size_for(ArrayRef size, ArrayRef stride) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(), + "storage_size_for(size, stride) requires that size and stride ", + "have the same size as a precondition."); + T storage_size = 1; + for (const auto dim : c10::irange(size.size())) { + if (size[dim] == 0) { + storage_size = 0; + break; + } + storage_size += (size[dim] - 1) * stride[dim]; + } + return storage_size; +} + +inline const Tensor& resize_named_tensor_( + const Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + TORCH_INTERNAL_ASSERT(self.has_names()); + TORCH_CHECK( + self.sizes() == size, + "Cannot resize named tensor with resize_ or resize_as_ (tried to resize " + "Tensor", + self.names(), + " with size ", + self.sizes(), + " to ", + size, + "). This may be caused by passing a named tensor ", + "as an `out=` argument; please ensure that the sizes are the same. "); + TORCH_CHECK( + !optional_memory_format.has_value(), + "Unsupported memory format for named tensor resize ", + optional_memory_format.value()); + return self; +} + +// For deterministic output, fill new elements that were added after a storage +// resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage +// before the resize happened. +inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) { + const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage(); + int64_t new_storage_nbytes = storage.nbytes(); + int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize(); + int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize(); + if (new_storage_numel > old_storage_numel) { + at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device())); + tensor_view.set_( + storage, + /*storage_offset=*/old_storage_numel, + /*size=*/{new_storage_numel - old_storage_numel}, + /*stride=*/{1}); + at::native::fill_empty_deterministic_(tensor_view); + } + return tensor; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..5b7167ee93dd29edd028a76040a6f1937c2de35d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h @@ -0,0 +1,544 @@ +#pragma once +// Please note that this file is +// used across both CPU and GPU. + +#include +#include +#include +#include +#include +#if defined(__CUDACC__) +#include +#include +#elif defined(__HIPCC__) +#include +#include +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#else +#include +#define device_sqrt std::sqrt +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +template +inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); +#else + scalar_t max = at::_isnan(b) ? b : std::max(a, b); +#endif + return max; +} +template +inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); +#else + scalar_t min = at::_isnan(b) ? b : std::min(a, b); +#endif + return min; +} +#define MAX(X, Y) max_propagate_nan(X,Y) +#define MIN(X, Y) min_propagate_nan(X,Y) +#else +#include +#define MAX(X, Y) max_impl(X,Y) +#define MIN(X, Y) min_impl(X,Y) +#endif + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) +#include +#define compat_pow c10::cuda::compat::pow +#elif defined(__HIPCC__) +#include +#define compat_pow c10::hip::compat::pow +#else +#define compat_pow std::pow +#endif + +namespace at { namespace native { + +namespace detail { + +#if defined(__CUDACC__) || defined(__HIPCC__) +template using pair = thrust::pair; +#else +template using pair = std::pair; +#endif + +} // namespace detail + +template +struct WelfordData { + scalar_t mean; + scalar_t m2; + index_t n; + scalar_t nf; + + C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {} + + C10_HOST_DEVICE WelfordData( + scalar_t mean, + scalar_t m2, + index_t n, + scalar_t nf) + : mean(mean), m2(m2), n(n), nf(nf) {} +}; + + +template +struct WelfordOps { + acc_scalar_t correction; + bool take_sqrt; + public: + using acc_t = WelfordData; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + // We accumulate n in index_t to avoid cumulative rounding error, but still + // need nf for use in combine where int32 may overflow. + index_t new_n = acc.n + 1; + acc_scalar_t new_nf = static_cast(new_n); + acc_scalar_t delta = data - acc.mean; + acc_scalar_t new_mean = acc.mean + delta / new_nf; + acc_scalar_t new_delta = data - new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + new_n, + new_nf, + }; + } + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + acc_scalar_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the count + // correctly within its range, setting it to -1 to avoid confusion + -1, + new_count + }; + } + inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const auto divisor = acc.nf > correction ? acc.nf - correction : 0; + const auto var = acc.m2 / divisor; + res_t results(take_sqrt ? device_sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.mean, offset) + , WARP_SHFL_DOWN(acc.m2, offset) + , WARP_SHFL_DOWN(acc.n, offset) + , WARP_SHFL_DOWN(acc.nf, offset) + }; + } +#endif + C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +template +struct MeanOps { + factor_t factor; + + inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const { + return combine(a, static_cast(b)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a * factor; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif + + MeanOps(factor_t factor): factor(factor) { + } +}; + +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMinOps { + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(data))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MIN(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMaxOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(data))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MAX(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOps { + acc_t norm_; + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(data)), norm_); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return compat_pow(a, static_cast(1.0) / norm_); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif + + NormOps(acc_t norm_): norm_(norm_) { + } +}; + +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormZeroOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + (data == static_cast(0) ? static_cast(0) : static_cast(1)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOneOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(data)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + + +template +struct AbsSwitch {}; + +template +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { + return static_cast(data); +} + +template +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +template +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { + return static_cast(std::abs(data)); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormTwoOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return device_sqrt(a); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +template +struct NanSumOps { + inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const { + return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE data_t project(acc_t a) const { + return data_t{a}; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +namespace detail { + +template +struct LessOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a < b); + } +}; + +template +struct GreaterOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); + } +}; + +template +struct MinMaxReductionOps { + using scalar_t = typename binary_function_traits::arg1_t; + using index_t = int64_t; + using arg_t = detail::pair; + + static C10_DEVICE arg_t project(arg_t arg) { + return arg; + } + + static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) { + return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx); + } + + static C10_DEVICE arg_t combine(arg_t a, arg_t b) { + return comp_t{}(a.first, b.first, a.second, b.second) ? a : b; + } + + static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) { + return {a.first, a.second + base_idx}; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) { + return arg_t(WARP_SHFL_DOWN(arg.first, offset), + WARP_SHFL_DOWN(arg.second, offset)); + } +#endif +}; + +template +struct ArgReductionOps : public MinMaxReductionOps { + using typename MinMaxReductionOps::scalar_t; + using typename MinMaxReductionOps::index_t; + using typename MinMaxReductionOps::arg_t; + + static C10_DEVICE index_t project(arg_t arg) { + return arg.second; + } +}; + +} // namespace detail + +template +struct ArgMaxOps : + public detail::ArgReductionOps> { +}; + +template +struct ArgMinOps : + public detail::ArgReductionOps> { +}; + +template +struct MinOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MaxOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MinMaxOps { + using acc_t = detail::pair; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + return combine(acc, {data, data}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first; + auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second; + + return {min_val, max_val}; + } + + inline C10_DEVICE acc_t project(acc_t acc) const { + return acc; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset) + }; + } +#endif +}; + +}} // namespace at::native + +#undef MAX +#undef MIN diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3953ca0d17b57f2547bafd2d08bec577b83b713e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h @@ -0,0 +1,190 @@ +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::sparse { + +// Just for documentary purposes +using SparseTensor = Tensor; +using SparseType = Type; + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) { + TORCH_INTERNAL_ASSERT( + self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. This used to be called THSTensor_(_move) +inline void alias_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values) { + get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values); +} + +// Take indices and values and makes a (data) copy of them to put into the +// sparse indices/values. This used to be called THSTensor_(_set) +inline void copy_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values, + bool non_blocking) { + alias_into_sparse( + self, + indices.to(self._indices().options(), non_blocking, /*copy=*/true), + values.to(self._values().options(), non_blocking, /*copy=*/true)); +} + +// TODO: put this into the public API +inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) { + return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); +} + +inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) { + return self.sparse_dim() == src.sparse_dim() && + self.dense_dim() == src.dense_dim(); +} + +// Give us a new values tensor, with the same dimensionality +// as 'values' but with a new number of non-zero elements. +// TODO: Expose this for real in ATen, some day? +// NB: Doesn't preserve data. +inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) { + std::vector size = values.sizes().vec(); + size[0] = nnz; + return at::empty(size, values.options()); +} + +// NOTE [ Flatten Sparse Indices ] +// This helper function flattens a sparse indices tensor (a Tensor) into a 1D +// indices tensor. E.g., +// input = [[2, 4, 0], +// [3, 1, 10]] +// full_size = [2, 12] +// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10] +// +// In other words, assuming that each `indices[i, :]` is a valid index to a +// tensor `t` of shape `full_size`. This returns the corresponding indices to +// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`. +// if forceClone is true, the result will forced to be a clone of self. +// if force_clone is true, the result will forced to be a clone of self. +TORCH_API Tensor flatten_indices( + const Tensor& indices, + IntArrayRef full_size, + bool force_clone = false); + +// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten +// Sparse Indices ], except this one allows partial flatten: only flatten on +// specified dims. Note that the flatten indices might be uncoalesced if +// dims_to_flatten.size() < sparse_dim. Also if input indices is already +// coalesced, the flattened indices will also be sorted. +// +// args: +// indices: sparse tensor indices +// sizes: sparse tensor sizes +// dims_to_flatten: a list of dim index to flatten +// +// Ex1: +// indices = [[2, 4, 0], +// [3, 1, 3]] +// sizes = [2, 12] +// dims_to_flatten = [0, 1] +// new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3] +// +// Ex2: +// dims_to_flatten = [1] +// new_indices = [ 3, 1, 3 ] # uncoalesced +TORCH_API Tensor flatten_indices_by_dims( + const Tensor& indices, + const IntArrayRef& sizes, + const IntArrayRef& dims_to_flatten); + +// Find the CSR representation for a row `indices` from the COO format +TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz); + +TORCH_API Tensor zeros_like_with_indices(const Tensor& t); + +template +class TensorGeometryHolder { + using geometry_holder_t = std::array; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options = {}) { + std::copy(sizes.begin(), sizes.end(), t_sizes.begin()); + std::copy(strides.begin(), strides.end(), t_strides.begin()); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides()) {} + + auto operator*() const { + return std::make_tuple(t_sizes, t_strides); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +template <> +class TensorGeometryHolder<0> { + using geometry_holder_t = Tensor; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options) { + const int64_t t_ndims = sizes.size(); + const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU); + Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options); + t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options)); + t_sizes_and_strides_cpu.select(0, 1).copy_( + at::tensor(strides, cpu_options)); + const Tensor t_sizes_and_strides = + t_sizes_and_strides_cpu.to(options.device()); + t_sizes = t_sizes_and_strides.select(0, 0); + t_strides = t_sizes_and_strides.select(0, 1); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {} + + auto operator*() const { + return std::make_tuple( + t_sizes.template data_ptr(), + t_strides.template data_ptr()); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +// Return all indices of a tensor with the given shape. +// +// full_coo_indices(shape) is equivalent to +// torch.ones(shape).nonzero().transpose(-2, -1) but much faster. +TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options); + +} // namespace at::sparse diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/StridedRandomAccessor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/StridedRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..ad8f6d7b8830f10575e75c65758cc846cc800ae6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/StridedRandomAccessor.h @@ -0,0 +1,301 @@ +#pragma once + +namespace at::native { + +// (Const)StridedRandomAccessor is a +// (const) random access iterator defined over +// a strided array. + +// The traits below are to introduce __restrict__ +// modifier on different platforms. + +template +struct DefaultPtrTraits { + using PtrType = T*; +}; + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +template +struct RestrictPtrTraits { + using PtrType = T* RESTRICT; +}; + +template < + typename T, + typename index_t = int64_t, + template class PtrTraits = DefaultPtrTraits +> +class ConstStridedRandomAccessor { +public: + using difference_type = index_t; + using value_type = const T; + using pointer = const typename PtrTraits::PtrType; + using reference = const value_type&; + using iterator_category = std::random_access_iterator_tag; + + using PtrType = typename PtrTraits::PtrType; + using index_type = index_t; + + // Constructors { + C10_HOST_DEVICE + ConstStridedRandomAccessor(PtrType ptr, index_t stride) + : ptr{ptr}, stride{stride} + {} + + C10_HOST_DEVICE + explicit ConstStridedRandomAccessor(PtrType ptr) + : ptr{ptr}, stride{static_cast(1)} + {} + + C10_HOST_DEVICE + ConstStridedRandomAccessor() + : ptr{nullptr}, stride{static_cast(1)} + {} + // } + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return *ptr; + } + + C10_HOST_DEVICE + const value_type* operator->() const { + return reinterpret_cast(ptr); + } + + C10_HOST_DEVICE + reference operator[](index_t idx) const { + return ptr[idx * stride]; + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator++() { + ptr += stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator++(int) { + ConstStridedRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator--() { + ptr -= stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator--(int) { + ConstStridedRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator+=(index_t offset) { + ptr += offset * stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator+(index_t offset) const { + return ConstStridedRandomAccessor(ptr + offset * stride, stride); + } + + C10_HOST_DEVICE + friend ConstStridedRandomAccessor operator+( + index_t offset, + const ConstStridedRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator-=(index_t offset) { + ptr -= offset * stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator-(index_t offset) const { + return ConstStridedRandomAccessor(ptr - offset * stride, stride); + } + + // Note that this operator is well-defined when `this` and `other` + // represent the same sequences, i.e. when + // 1. this.stride == other.stride, + // 2. |other - this| / this.stride is an Integer. + C10_HOST_DEVICE + difference_type operator-(const ConstStridedRandomAccessor& other) const { + return (ptr - other.ptr) / stride; + } + // } + + // Comparison operators { + C10_HOST_DEVICE + bool operator==(const ConstStridedRandomAccessor& other) const { + return (ptr == other.ptr) && (stride == other.stride); + } + + C10_HOST_DEVICE + bool operator!=(const ConstStridedRandomAccessor& other) const { + return !(*this == other); + } + + C10_HOST_DEVICE + bool operator<(const ConstStridedRandomAccessor& other) const { + return ptr < other.ptr; + } + + C10_HOST_DEVICE + bool operator<=(const ConstStridedRandomAccessor& other) const { + return (*this < other) || (*this == other); + } + + C10_HOST_DEVICE + bool operator>(const ConstStridedRandomAccessor& other) const { + return !(*this <= other); + } + + C10_HOST_DEVICE + bool operator>=(const ConstStridedRandomAccessor& other) const { + return !(*this < other); + } + // } + +protected: + PtrType ptr; + index_t stride; +}; + +template < + typename T, + typename index_t = int64_t, + template class PtrTraits = DefaultPtrTraits +> +class StridedRandomAccessor + : public ConstStridedRandomAccessor { +public: + using difference_type = index_t; + using value_type = T; + using pointer = typename PtrTraits::PtrType; + using reference = value_type&; + + using BaseType = ConstStridedRandomAccessor; + using PtrType = typename PtrTraits::PtrType; + + // Constructors { + C10_HOST_DEVICE + StridedRandomAccessor(PtrType ptr, index_t stride) + : BaseType(ptr, stride) + {} + + C10_HOST_DEVICE + explicit StridedRandomAccessor(PtrType ptr) + : BaseType(ptr) + {} + + C10_HOST_DEVICE + StridedRandomAccessor() + : BaseType() + {} + // } + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return *this->ptr; + } + + C10_HOST_DEVICE + value_type* operator->() const { + return reinterpret_cast(this->ptr); + } + + C10_HOST_DEVICE + reference operator[](index_t idx) const { + return this->ptr[idx * this->stride]; + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + StridedRandomAccessor& operator++() { + this->ptr += this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator++(int) { + StridedRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + StridedRandomAccessor& operator--() { + this->ptr -= this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator--(int) { + StridedRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + StridedRandomAccessor& operator+=(index_t offset) { + this->ptr += offset * this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator+(index_t offset) const { + return StridedRandomAccessor(this->ptr + offset * this->stride, this->stride); + } + + C10_HOST_DEVICE + friend StridedRandomAccessor operator+( + index_t offset, + const StridedRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + StridedRandomAccessor& operator-=(index_t offset) { + this->ptr -= offset * this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator-(index_t offset) const { + return StridedRandomAccessor(this->ptr - offset * this->stride, this->stride); + } + + // Note that here we call BaseType::operator- version + C10_HOST_DEVICE + difference_type operator-(const BaseType& other) const { + return (static_cast(*this) - other); + } + // } +}; + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..7b9d1446a087bf9fb223e9f02bd678ace6ddb65f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h @@ -0,0 +1,92 @@ +#pragma once +#include +#include +#include + +namespace at::native { +namespace { +static std::string shapes_as_str(TensorList tensors) { + std::ostringstream os; + bool first = true; + for (auto& tensor : tensors) { + if (tensor.defined()) { + if (!first) { + os << ", "; + } + os << tensor.sizes(); + first = false; + } + } + return os.str(); +} +} // anonymous namespace + +static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices, +const Tensor& value){ + if (!(value.numel() ==1 && value.device().is_cpu())){ + return std::make_tuple(false,Tensor()); + } + int64_t num_ind = 0; + Tensor mask; + auto self_device = self.device(); + for (const c10::optional& i: indices) { + if (!i.has_value() || !(*i).defined()){ + num_ind++; + } else { + const Tensor &index = *i; + if ((index.scalar_type() != kByte && index.scalar_type() != kBool) || + index.device() != self_device || mask.defined()){ + return std::make_tuple(false, Tensor()); + } else { + mask = index; + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = num_ind + j; + TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j, + " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx); + } + num_ind += mask.ndimension(); + } + } + } + for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) { + mask = mask.unsqueeze(-1); + } + return std::make_tuple(true, mask); +} + +static AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { + checkIndexTensorTypes(orig, /*allow_int*/ true); + // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors + auto indices = expandTensors(self, orig); + // next broadcast all index tensors together + try { + indices = expand_outplace(indices); + } catch (std::exception& e) { + TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together" + " with shapes ", shapes_as_str(indices)); + } + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices) = transposeToFront(self, indices); + } + // Ensure indices are on the same device as self + for (auto & indice : indices) { + if (indice.defined() && indice.device() != self.device()) { + indice = indice.to(self.device()); + } + } + for (auto & indice : indices) { + if (indice.defined() && indice.dtype() == at::kInt) { + indice = indice.to(at::kLong); + } + } + + return AdvancedIndex(self, indices); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h new file mode 100644 index 0000000000000000000000000000000000000000..4d52446446316af9546bf3a1f5c0eca58a6e608f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include + +namespace at::native { +//input tensors are non-zero dim and non-empty +template + +void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) { + int ndims = self.dim(); + int tensor_dim_apply_has_finished = 0; + std::vector counter(ndims, 0); + const T1* self_data = self.const_data_ptr(); + T1* values_data = values.data_ptr(); + T2* indices_data = indices.data_ptr(); + int64_t self_stride = self.stride(dim); + int64_t values_stride = values.stride(dim); + int64_t indices_stride = indices.stride(dim); + int self_dim_size = self.size(dim); + + while (!tensor_dim_apply_has_finished) { + func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride); + if (ndims == 1) { + break; + } + for (const auto dim_i : c10::irange(ndims)) { + if (dim_i == dim) { + if (dim_i == (ndims - 1)) { + tensor_dim_apply_has_finished = 1; + break; + } + continue; + } + counter[dim_i]++; + self_data += self.stride(dim_i); + values_data += values.stride(dim_i); + indices_data += indices.stride(dim_i); + + if (counter[dim_i] == self.size(dim_i)) { + if (dim_i == ndims-1) { + tensor_dim_apply_has_finished = 1; + break; + } else { + self_data -= counter[dim_i]*self.stride(dim_i); + values_data -= counter[dim_i]*values.stride(dim_i); + indices_data -= counter[dim_i]*indices.stride(dim_i); + counter[dim_i] = 0; + } + } else { + break; + } + } + } +} +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..f9b2893d768a96ed72fe25514e428fdb9025b7c7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h @@ -0,0 +1,142 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { +// Different combinations of row, col, and offset can lead to two cases: +// +// Case 1 - Trapezoid (Triangle as a special case): row + offset <= col +// Example A: offset > 0 +// 1 1 0 0 0 +// 1 1 1 0 0 +// 1 1 1 1 0 +// Example B: offset <= 0 +// 0 0 0 +// 1 0 0 +// 1 1 0 +// In this case, we calculate the number of elements in the first row and +// last row of the tril respectively, and then compute the tril size. +// +// Case 2 - Trapezoid + Rectangle: row + offset > col +// Example: +// 1 1 0 +// 1 1 1 +// 1 1 1 +// In this case, we first calculate the size of top trapezoid, and then +// calculate the size of the bottom rectangle. +inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { + // If either dimension is 0 then the there is no tril + if (row == 0 || col == 0) { + return 0; + } + // number of elements in the first row of the tril + auto m_first_row = offset > 0 ? + std::min(col, 1 + offset) : // upper bounded by col + row + offset > 0; // either 0 or 1 + // number of elements in the last row of the tril, bounded by [0, col] + auto m_last_row = std::max(0, std::min(col, row + offset)); + // number of rows, bounded by [0, row] + auto n_row_all = std::max(0, std::min(row, row + offset)); + auto n_row_trapezoid = (m_last_row - m_first_row + 1); + + // calculate # of elements in the top trapezoid + auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1; + + // calculate # of elements in the bottom rectangle if there is any + auto diff_row = n_row_all - n_row_trapezoid; + if (diff_row > 0) { + tril_size += diff_row * col; + } + + return tril_size; +} + +inline void check_args( + int64_t row, int64_t col, c10::optional layout_opt) { + TORCH_CHECK(row >= 0, "row must be non-negative, got", row); + TORCH_CHECK(col >= 0, "col must be non-negative, got", col); + if (layout_opt.has_value()) { + TORCH_CHECK( + *layout_opt == at::kStrided, + "only support layout=torch.strided, got", + *layout_opt) + } +} + +using at::check_size_nonnegative; + +// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n)) +inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) { + // match defined() to behavior of checks below + TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(), + "n is too large for result tensor type: '", tensor.toString(), "'"); + + // Ensure sufficient precision for floating point representation. + switch (tensor.scalar_type()) { + case at::ScalarType::Half: + TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type."); + break; + case at::ScalarType::Float: + TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type."); + break; + case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to check + TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type."); + break; + default: + break; + } +} + +// Called by `empty*` functions when deterministic algorithms are enabled to +// fill the tensor with NaN if it is floating point or complex type, or fill +// with max value if it is integer type +inline Tensor& fill_empty_deterministic_(Tensor& tensor) { + if (tensor.is_floating_point() || tensor.is_complex()) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kBFloat16, kHalf, tensor.scalar_type(), "fill_empty_deterministic_", [&]() { + tensor.fill_(std::numeric_limits::quiet_NaN()); + }); + } else { + AT_DISPATCH_V2( + tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() { + tensor.fill_(std::numeric_limits::max()); + }), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2)); + } + return tensor; +} + +// The ZeroTensor allocator ignores whatever allocation is requested and always +// gives you nullptr +struct ZeroTensorAllocator final : public at::Allocator { + ZeroTensorAllocator(at::Device device) : device_(device) {}; + ~ZeroTensorAllocator() override = default; + static void deleter(void* const pointer) { + TORCH_INTERNAL_ASSERT(!pointer); + } + DataPtr allocate(const size_t /*nbytes*/) override { + return {nullptr, nullptr, &deleter, device_}; + } + DeleterFnPtr raw_deleter() const override { + return deleter; + } + void copy_data(void* dest, const void* src, std::size_t count) const final {} + at::Device device_; +}; + +using binary_fn = void (*)(TensorIterator&); + +DECLARE_DISPATCH(binary_fn, complex_stub); +DECLARE_DISPATCH(binary_fn, polar_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h new file mode 100644 index 0000000000000000000000000000000000000000..a2bdd6eb13e4bde62cba1f9b5c65726abb5bb7d0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include + + +// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. + +// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments +// to the function that is being called. +// On CUDA, the cast is currently pushed down into the kernel (for performance reasons). +// On CPU, there is currently an internal assert that a dynamic_cast is not needed. + +namespace at::native { + +// `needs_dynamic_casting` compares the types expected by iterator +// (i.e. dtypes of the operands) with the actual type of the arguments +// (and returns) of func_t +template::arity> +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::template arg::type; + using cpp_map = c10::CppTypeToScalarType; + + if (iter.input_dtype(nargs-1) != cpp_map::value) { + return true; + } + return needs_dynamic_casting::check(iter); + } +}; + +template +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::result_type; + + // we could assert output numbers are correct here, but checks + // (including arity) are currently pushed outside of this struct. + if constexpr (std::is_void_v) { + return false; + } else { + return iter.dtype(0) != c10::CppTypeToScalarType::value; + } + } +}; + +} //namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorProperties.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorProperties.h new file mode 100644 index 0000000000000000000000000000000000000000..87aca85fb3af1da0db523300a8cb3b310a0a88ad --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorProperties.h @@ -0,0 +1,12 @@ +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +namespace at { +class TensorBase; +} + +namespace at::native { + +TORCH_API bool cudnn_is_acceptable(const TensorBase& self); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h new file mode 100644 index 0000000000000000000000000000000000000000..638ddba40525e012156b73e3996b0c54922f5772 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h @@ -0,0 +1,105 @@ +#pragma once +#include +#include +#include + +namespace at::native { + +TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self); + +inline bool cat_should_skip_tensor(const Tensor& t) { + return t.numel() == 0 && t.dim() == 1; +} + + // Check to see if the shape of tensors is compatible + // for being concatenated along a given dimension. +inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) { + int64_t first_dims = first.dim(); + int64_t second_dims = second.dim(); + TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ", + first_dims, " and ", second_dims); + for (const auto dim : c10::irange(first_dims)) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = first.sizes()[dim]; + int64_t second_dim_size = second.sizes()[dim]; + TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ", + dimension, ". Expected size ", static_cast(first_dim_size), " but got size ", static_cast(second_dim_size), " for tensor number ", index, " in the list."); + } + } + +inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) { + int64_t i = 0; + for(const Tensor& t : tensors) { + TORCH_CHECK(t.dim() > 0, + "zero-dimensional tensor (at position ", i, ") cannot be concatenated"); + i++; + } +} + +inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) { + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size); + int64_t dim_size = self.size(dim); + TORCH_CHECK(split_size > 0 || dim_size == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size + // (returns a single split). We might want to error here, but keep it for BC. + num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + return num_splits; +} + +inline bool have_same_ndims(TensorList tensors) { + auto ndim = tensors[0].dim(); + for (const auto tensor_idx : c10::irange(tensors.size())) { + if(tensors[tensor_idx].dim() != ndim) { + return false; + } + } + return true; +} + +inline void leading_dimension_matches(TensorList tensors, int64_t dim) { + auto tensor_zero_size = tensors[0].sizes(); + std::vector leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim); + for (const auto i : c10::irange(tensors.size())) { + at::Tensor tensor = tensors[i]; + for(const auto j : c10::irange(dim)) { + TORCH_CHECK( + tensor.size(j) == leading_dim_sizes[j], + "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors" + ); + } + } +} + +inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) { + TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks"); + TORCH_CHECK(!tensors.empty(), + "_chunk_cat expects a non-empty input tensor list"); + auto expected_dtype = tensors[0].dtype(); + auto expected_device = tensors[0].device(); + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor"); + TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype"); + TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device"); + } + if (have_same_ndims(tensors)) { + dim = maybe_wrap_dim(dim, tensors[0].dim()); + } else { + TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims") + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors"); + } + } + leading_dimension_matches(tensors, dim); + return dim; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TopKImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TopKImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..0a11f5f4087536c928c6294e92ce6cae03bd1378 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TopKImpl.h @@ -0,0 +1,98 @@ +#pragma once +#include +#include + +namespace at::native { + +#ifdef CPU_CAPABILITY +inline namespace CPU_CAPABILITY { +#else +inline namespace DEFAULT { +#endif + +// Core topk loop, shared between CPU and QuantizedCPU +template +void topk_impl_loop( + const int64_t mode_values_stride, + const int64_t mode_indices_stride, + const int64_t tmp_values_stride, + const int64_t k, + const int64_t dim_size, + const bool largest, + const bool sorted, + char** data, const int64_t* strides, const int64_t n) { + + // If k is zero, then output values and indices are empty tensors + // So iterating over other dims is pointless + if (k == 0) { + return; + } + using elem_t = std::pair; + std::vector queue(dim_size); + for (const auto i : c10::irange(n)) { + TensorAccessor mode_values( + reinterpret_cast(data[0] + i * strides[0]), + &k, &mode_values_stride); + TensorAccessor mode_indices( + reinterpret_cast(data[1] + i * strides[1]), + &k, &mode_indices_stride); + TensorAccessor tmp_values( + reinterpret_cast(data[2] + i * strides[2]), + &dim_size, &tmp_values_stride); + + auto n_2 = dim_size; + auto use_partial_sort = k * 64 <= n_2; + + for (const auto j : c10::irange(n_2)) { + queue[j].first = tmp_values[j]; + queue[j].second = j; + } + + // we want nan to be sorted as top for numpy compatibility + if (use_partial_sort) { + if (largest) { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } else { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } + } else { + std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k -1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } + } + + for (const auto j : c10::irange(k)) { + mode_values[j] = queue[j].first; + mode_indices[j] = queue[j].second; + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TransposeType.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TransposeType.h new file mode 100644 index 0000000000000000000000000000000000000000..603bf6fee60aa2bc1850fae3eb0dac73345d7fb9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TransposeType.h @@ -0,0 +1,23 @@ +#pragma once +#include + +namespace at::native { + +// Used as an interface between the different BLAS-like libraries +enum class TransposeType { + NoTranspose, + Transpose, + ConjTranspose, +}; + +// Transforms TransposeType into the BLAS / LAPACK format +static inline char to_blas(TransposeType trans) { + switch (trans) { + case TransposeType::Transpose: return 'T'; + case TransposeType::NoTranspose: return 'N'; + case TransposeType::ConjTranspose: return 'C'; + } + TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h new file mode 100644 index 0000000000000000000000000000000000000000..90ead9d1f7ad48187b8c0a28af2fd59915887d17 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +namespace at::native { + +void Unfold3dCopyCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void* dst); + +void Unfold3dAccCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void *dst); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h new file mode 100644 index 0000000000000000000000000000000000000000..7ff39f84c6fddb01ff806b3a07d4c7ccb29c708c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +using unfold_backward_fn = void (*)( + Tensor& grad_in, + const Tensor& grad, + int64_t dim, + int64_t size, + int64_t step +); + +DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub); + +namespace { + +// Note on naming: it is unconventional. +// grad_in does not mean that it is a gradient wrt to input, +// grad_in/grad_out is just an input/output of unfold_backward kernel. + +static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out( + Tensor& grad_out, + const Tensor& grad_in, + int64_t dim, + int64_t size, + int64_t step +) { + dim = maybe_wrap_dim(dim, grad_out.dim()); + // last dim stores the folds + + auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim); + auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim); + // dictates the number of elements to iterate over + // in dimension `dim` + auto iter_dim_size = std::min( + grad_out_dim_size, + (grad_in_dim_size - 1) * step + size + ); + + /* prepare grad_out for TensorIterator { */ + auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec()); + auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec()); + grad_out_sizes[dim] = iter_dim_size; + auto grad_out_restrided = grad_out.as_strided( + grad_out_sizes, grad_out_strides + ); + /* } */ + + /* prepare grad_in for TensorIterator { */ + auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec()); + auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec()); + + // set strides for dim to 0 + // and size to 1 because + // this dimension is indexed inside the kernel + grad_in_strides[dim] = 0; + grad_in_sizes[dim] = 1; + + grad_in_strides.pop_back(); + grad_in_sizes.pop_back(); + + auto grad_in_restrided = grad_in.squeeze(-1).as_strided( + grad_in_sizes, grad_in_strides + ); + /* } */ + + // During the TensorIterator iteration we have to know + // i_dim in grad_out[i_1,...,i_dim,...i_n], + // idx_dim stores this information + /* prepare idx_dim for TensorIterator { */ + auto idx_dim = at::arange( + 0, iter_dim_size, grad_in.options().dtype(at::kLong) + ); + + auto grad_out_dim = ensure_nonempty_dim(grad_out.dim()); + + auto idx_dim_strides = std::vector(grad_out_dim, 0); + auto idx_dim_sizes = std::vector(grad_out_dim, 1); + + idx_dim_strides[dim] = 1; + idx_dim_sizes[dim] = iter_dim_size; + + // idx_dim size will broadcast over determined by grad_out sizes in TensorIterator + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + /* } */ + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_owned_output(grad_out_restrided) + .add_owned_input(grad_in_restrided) + .add_owned_input(idx_dim_restrided) + .build(); + + return iter; +} + +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h new file mode 100644 index 0000000000000000000000000000000000000000..8dadc7cee3ae4919034aab308eae31ed71762624 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h @@ -0,0 +1,506 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Note [compute_scales_value] + * Note [area_pixel_compute_scale] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Interpolate with scale_factor can have different behaviors + * depending on the value of recompute_scale_factor: + * + * - With recompute_scale_factor = True (current default behavior): + * the scale_factor, when provided by the user, are used to calculate + * the output size. The input size and the computed output_size + * are then used to infer new values for the scales which are + * used in the interpolation. Because floating-point math is not exact, + * this may be a different value from the user-supplied scales. + * + * - With recompute_scale_factor = False (which will be the default + * behavior starting 1.5.0): + * the behavior follows opencv logic, and the scales provided by + * the user are the ones used in the interpolation calculations. + * + * If the scales are not provided or if they are provided but + * recompute_scale_factor is set to True (default behavior), the scales + * are computed from the input and the output size; + * + * + * When the scales are inferred from the input and output sizes, + * we view each pixel as an area, idx + 0.5 as its center index. + * Here is an example formula in 1D case. + * if align_corners: center of two corner pixel areas are preserved, + * (0.5, 0.5) -> (0.5, 0.5), + * (input_size - 0.5, 0.5) -> (output_size - 0.5) + * scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5) + * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) + * if not align_corners: the whole range is scaled accordingly + * scale = input_size / output_size + * src_idx + 0.5 = scale * (dst_index + 0.5) + */ + +namespace at::native { + +namespace upsample { + +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. + at::OptionalIntArrayRef output_size, + c10::optional> scale_factors); + +inline c10::optional get_scale_value(c10::optional> scales, int idx) { + if (!scales) { + return c10::nullopt; + } + return scales->at(idx); +} + +} // namespace upsample + +using scale_t = c10::optional; +using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w); +using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel); +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel); +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel); +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel); +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel); +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel); +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel); +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel); +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel); +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel); +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel); +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel); +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel); +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel); +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel); +DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel); +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel); +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel); + +static C10_UNUSED std::array upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int64_t output_width = output_size[0]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_width = input_size[2]; + + TORCH_CHECK( + input_width > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (W: ", + input_width, + ") and output (W: ", + output_width, + ")"); + + return {nbatch, channels, output_width}; +} + +static C10_UNUSED std::array upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_height = input_size[2]; + int64_t input_width = input_size[3]; + + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + return {nbatch, channels, output_height, output_width}; +} + +static C10_UNUSED +std::array upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_depth = input_size[2]; + int64_t input_height = input_size[3]; + int64_t input_width = input_size[4]; + + TORCH_CHECK( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (D: ", + input_depth, + ", H: ", + input_height, + ", W: ", + input_width, + ") output (D: ", + output_depth, + ", H: ", + output_height, + ", W: ", + output_width, + ")"); + + + return {nbatch, channels, output_depth, output_height, output_width}; +} + +static inline void upsample_2d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t nbatch, + int64_t nchannels, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + // Allow for empty batch size but not other dimensions + TORCH_CHECK( + (input.numel() != 0 || + (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0) + ) && + input.dim() == 4, + "Non-empty 4D data tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 4, 0, nbatch); + check_dim_size(grad_output, 4, 1, nchannels); + check_dim_size(grad_output, 4, 2, output_height); + check_dim_size(grad_output, 4, 3, output_width); + } +} + +template +static inline scalar_t compute_scales_value( + const c10::optional scale, + int64_t input_size, + int64_t output_size) { + // see Note [compute_scales_value] + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) + ? static_cast(1.0 / scale.value()) + : (static_cast(input_size) / output_size); +} + +template +static inline scalar_t area_pixel_compute_scale( + int64_t input_size, + int64_t output_size, + bool align_corners, + const c10::optional scale) { + // see Note [area_pixel_compute_scale] + if(align_corners) { + if(output_size > 1) { + return static_cast(input_size - 1) / (output_size - 1); + } else { + return static_cast(0); + } + } else { + return compute_scales_value(scale, input_size, output_size); + } +} + +template +static inline scalar_t area_pixel_compute_source_index( + scalar_t scale, + int64_t dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + scalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // [Note] Follow Opencv resize logic: + // We allow negative src_idx here and later will use + // dx = src_idx - floorf(src_idx) + // to compute the "distance"(which affects weights). + // For linear modes, weight distribution doesn't matter + // for negative indices as they use 2 pixels to interpolate. + // For example, [-1, 0], they both use pixel 0 value so it + // doesn't affect if we bound the src_idx to 0 or not. + // TODO: Our current linear mode impls use unbound indices + // where we should and then remove this cubic flag. + // This matters in cubic mode, as we might need [-1, 0, 1, 2] + // to interpolate and the weights can be affected. + return (!cubic && src_idx < static_cast(0)) ? scalar_t(0) + : src_idx; + } +} + +static inline int64_t nearest_neighbor_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // Index computation matching OpenCV INTER_NEAREST + // which is buggy and kept for BC + const int64_t src_index = + std::min(static_cast(floorf(dst_index * scale)), input_size - 1); + return src_index; +} + +static inline int64_t nearest_neighbor_exact_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int64_t src_index = + std::min(static_cast(floorf((dst_index + 0.5) * scale)), input_size - 1); + return src_index; +} + +static inline int64_t nearest_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + // This method specificly treats cases: output_size == input_size or + // output_size == 2 * input_size, that we would like to get rid of + // We keep this method for BC and consider as deprecated. + // See nearest_exact_idx as replacement + if (output_size == input_size) { + // scale_factor = 1, simply copy + return output_index; + } else if (output_size == 2 * input_size) { + // scale_factor = 2, shift input index + return output_index >> 1; + } else { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_compute_source_index(scale, output_index, input_size); + } +} + +static inline int64_t nearest_exact_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + c10::optional scales) { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); +} + +// Define a typedef to dispatch to nearest_idx or nearest_exact_idx +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional); + +template +static scalar_t upsample_get_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + return data[access_y * width + access_x]; +} + +template +static void upsample_increment_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y, + scalar_t value) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + data[access_y * width + access_x] += value; +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +static inline void get_cubic_upsample_coefficients( + scalar_t coeffs[4], + scalar_t t) { + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +static inline scalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + scalar_t t) { + scalar_t coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +// when `real_input_index` becomes larger than the range the floating point +// type can accurately represent, the type casting to `int64_t` might exceed +// `input_size`, causing overflow. So we guard it with `std::min` below. +template +static inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) { + input_index = std::min(static_cast(floorf(real_input_index)), input_size - 1); + lambda = std::min( + std::max(real_input_index - input_index, static_cast(0)), + static_cast(1) + ); +} + +template +static inline void compute_source_index_and_lambda( + int64_t& input_index0, + int64_t& input_index1, + scalar_t& lambda0, + scalar_t& lambda1, + opmath_t ratio, + int64_t output_index, + int64_t input_size, + int64_t output_size, + bool align_corners) { + if (output_size == input_size) { + // scale_factor = 1, simply copy + input_index0 = output_index; + input_index1 = output_index; + lambda0 = static_cast(1); + lambda1 = static_cast(0); + } else { + const auto real_input_index = + area_pixel_compute_source_index( + ratio, output_index, align_corners, /*cubic=*/false); + guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1); + int64_t offset = (input_index0 < input_size - 1) ? 1 : 0; + input_index1 = input_index0 + offset; + lambda0 = static_cast(1.) - lambda1; + } +} + +// It will not be used by data types other than BFloat16 and Half. +template || !std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + TORCH_CHECK((is_reduced_floating_point_v), + "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") + TORCH_CHECK((std::is_same::value), + "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") + return; +} + +template && std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec gin_bvec = bVec::loadu(gin + d); + fVec gin_fvec0, gin_fvec1; + std::tie(gin_fvec0, gin_fvec1) = convert_to_float(gin_bvec); + gin_fvec0 += fVec::loadu(buffer_ptr + d); + gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); + fVec(0).store(buffer_ptr + d); + fVec(0).store(buffer_ptr + d + fVec::size()); + convert_from_float(gin_fvec0, gin_fvec1).store(gin + d); + } + for (; d < size; d++) { + gin[d] += buffer_ptr[d]; + buffer_ptr[d] = 0; + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..e098d32b114d604f6d9a1b5160dbe87de52c4595 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h @@ -0,0 +1,48 @@ +// DON'T include this except from Binary*.cu files. It should not leak into +// headers. +#pragma once +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace binary_internal { + +template +struct DivFunctor { + __device__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a / b; + } +}; + +template +struct MulFunctor { + __device__ T operator()(T a, T b) const { + return a * b; + } +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead +// [-Werror=int-in-bool-context] +template <> +struct MulFunctor { + __device__ bool operator()(bool a, bool b) const { + return a && b; + } +}; +void div_true_kernel_cuda(TensorIteratorBase& iter); +void div_trunc_kernel_cuda(TensorIteratorBase& iter); +} // namespace binary_internal +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..d47a7fa776f1b681b26dc5ec8b4548604d359946 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +struct TupleInfoCPU { + template + using tuple = thrust::tuple; + + template + static constexpr auto tie(Types&... args) noexcept { + return thrust::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return thrust::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(thrust::get(rh.data())) { + return thrust::get(rh.data()); +} + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000000000000000000000000000000000000..6bcd57027d51725aa3f315b9c35b9db126fc76c9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,494 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at { namespace native { namespace detail { + +// Enum representing the FFT type +enum class CuFFTTransformType : int8_t { + C2C, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// This struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. +struct CuFFTParams +{ + int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + // These include additional batch dimension as well. + int64_t sizes_[max_rank + 1]; + int64_t input_strides_[max_rank + 1]; + int64_t output_strides_[max_rank + 1]; + CuFFTTransformType fft_type_; + ScalarType value_type_; + + CuFFTParams() = default; + + CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_sizes.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank); + + std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_); + std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_); + std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_); + } +}; + +static_assert(std::is_trivial::value, ""); + +// Returns true if the transform type has complex input +inline bool cufft_complex_input(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::C2R: + return true; + + case CuFFTTransformType::R2C: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Returns true if the transform type has complex output +inline bool cufft_complex_output(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::R2C: + return true; + + case CuFFTTransformType::C2R: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Create transform type enum from bools representing if input and output are complex +inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) { + if (complex_input && complex_output) { + return CuFFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return CuFFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return CuFFTTransformType::R2C; + } + TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported"); +} + + +class CuFFTHandle { + ::cufftHandle handle_; +public: + + CuFFTHandle() { + CUFFT_CHECK(cufftCreate(&handle_)); + } + + ::cufftHandle & get() { return handle_; } + const ::cufftHandle & get() const { return handle_; } + + ~CuFFTHandle() { +// Not using fftDestroy() for rocFFT to work around double freeing of handles +#if !defined(USE_ROCM) + cufftDestroy(handle_); +#endif + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +using cufft_size_type = long long int; + +using CuFFTDimVector = c10::SmallVector; + +// Struct representing a tensor in CuFFT's data layout for planning transforms +// See NOTE [ cuFFT Embedded Strides ]. +struct CuFFTDataLayout { + CuFFTDimVector embed; + cufft_size_type stride, dist; + bool must_clone, simple; +}; + +// Returns a cufft embedding for a contiguous signal of the given size. +// e.g. if the input is cloned, this will be the resulting data layout +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) { + CuFFTDataLayout layout; + layout.simple = true; + layout.must_clone = false; + layout.embed.assign(sizes.cbegin() + 1, sizes.cend()); + if (onesided) { + layout.embed.back() = sizes.back() / 2 + 1; + } + layout.stride = 1; + layout.dist = 1; + for (const auto& len : layout.embed) { + layout.dist *= len; + } + return layout; +} + +// Convert strides to a CuFFT embedded representation. +// If strides cannot be embedded, returns a simple layout and sets must_clone flag +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) { + const auto signal_ndim = strides.size() - 1; + CuFFTDataLayout layout; + auto last_stride = strides[signal_ndim]; + layout.must_clone = (last_stride <= 0); + + const auto last_dim_size = onesided ? + sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim]; + const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size; + + // Zero stides are not allowed, even if the batch size is one. + // If that happens just set a dummy case + if (sizes[0] == 1) { + layout.dist = signal_numel; + } else if (strides[0] == 0) { + layout.must_clone = true; + } else { + layout.dist = strides[0]; + } + + // Calculate the embedding shape, or set must_clone if the strides cannot be embedded + layout.embed.resize(signal_ndim); + for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) { + auto stride = strides[i]; + if (sizes[i] == 1) { + layout.embed[i] = 1; + } else if (stride > 0 && stride % last_stride == 0) { + layout.embed[i] = stride / last_stride; + last_stride = stride; + } else { + layout.must_clone = true; + } + } + + if (layout.must_clone) { + // If the input needs to be cloned, assume it will be contiguous + layout = cufft_simple_embed(sizes, onesided); + layout.must_clone = true; + } else { + layout.embed[0] = sizes[1]; + layout.stride = strides[signal_ndim]; + // Determine if layout represents a simple embedding (contiguous data) + layout.simple = [&] { + for (const auto i : c10::irange(1, signal_ndim - 1)) { + if (layout.embed[i] != sizes[i + 1]) { + return false; + } + } + + return (layout.stride == 1 && layout.dist == signal_numel && + layout.embed.back() == last_dim_size); + }(); + } + return layout; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(const CuFFTParams& params): + CuFFTConfig( + IntArrayRef(params.input_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.output_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.sizes_, params.signal_ndim_ + 1), + params.fft_type_, + params.value_type_) {} + + // For complex types, strides are in units of 2 * element_size(dtype) + // sizes are for the full signal, including batch size and always two-sided + CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype): + fft_type_(fft_type), value_type_(dtype) { + + // signal sizes (excluding batch dim) + CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const int64_t batch = sizes[0]; + const int64_t signal_ndim = sizes.size() - 1; + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + +#if defined(USE_ROCM) + // clone input to avoid issues with hipfft clobering the input and failing tests + clone_input = true; +#else + clone_input = false; +#endif + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. + if (dtype == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); + for (const auto i : c10::irange(signal_ndim)) { + TORCH_CHECK(is_pow_of_two(sizes[i + 1]), + "cuFFT only supports dimensions whose sizes are powers of two when" + " computing in half precision, but got a signal size of", + sizes.slice(1)); + } + clone_input |= in_strides.back() != 1; + } + + CuFFTDataLayout in_layout; + if (clone_input) { + in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R); + } else { + in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R); + } + auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C); + TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding"); + clone_input |= in_layout.must_clone; + + // Check if we can take advantage of simple data layout. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + const bool simple_layout = in_layout.simple && out_layout.simple; + cudaDataType itype, otype, exec_type; + const auto complex_input = cufft_complex_input(fft_type); + const auto complex_output = cufft_complex_output(fft_type); + if (dtype == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == ScalarType::Half) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); + } + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores istride, ostride, idist, and odist + // by assuming istride = ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, + out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return plan_ptr.get(); } + + CuFFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + bool should_clone_input() const { return clone_input; } + int64_t workspace_size() const { return ws_size; } + +private: + CuFFTHandle plan_ptr; + bool clone_input; + int64_t ws_size; + CuFFTTransformType fft_type_; + ScalarType value_type_; +}; + +#if defined(USE_ROCM) + // Note that the max plan number for CUDA version < 10 has to be 1023 + // due to a bug that fails on the 1024th plan + constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else + constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); + // The default max cache size chosen for CUDA version > 10 is arbitrary. + // This number puts a limit on how big of a plan cache should we maintain by + // default. Users can always configure it via cufft_set_plan_cache_max_size. + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept : + _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. + const CuFFTConfig &lookup(CuFFTParams params) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + +private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. + TORCH_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index); +void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index); +void cufft_clear_plan_cache_impl(DeviceIndex device_index); + +}}} // namespace at::native::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh new file mode 100644 index 0000000000000000000000000000000000000000..38a7804015be1822f4012f74319a459daeb5e885 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh @@ -0,0 +1,25 @@ +#pragma once + +namespace at { namespace native { +#if defined(USE_ROCM) +// take these out when ROCm implements std:: math functions +#include +template +static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val); + +template <> +__forceinline__ __device__ float device_sqrt(float val) { + return ::sqrtf(val); +} + +template <> +__forceinline__ __device__ double device_sqrt(double val) { + return ::sqrt(val); +} +#else +template +__forceinline__ __device__ double device_sqrt(scalar_t val) { + return std::sqrt(val); +} +#endif +}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..04a278d83f7632145ced78f040c59d3288f45dc3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h @@ -0,0 +1,672 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +// launch bounds used for kernels utilizing TensorIterator +const uint32_t block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// number of randoms given by distributions like curand_uniform4, curand_uniform2_double +// used in calculating philox offset. +const uint32_t curand4_engine_calls = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from curand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). Hence, the philox_offset is +// (number of elements per thread * number of engine calls), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. +std::tuple calc_execution_policy(int64_t total_elements) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + const uint32_t unroll = curand4_engine_calls; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1) + * curand4_engine_calls; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int numel, + PhiloxCudaState philox_args, + const dist_t dist_func, + const transform_t transform_func) { + auto seeds = at::cuda::philox::unpack(philox_args); + int idx = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + + int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +/** + * distribution_nullary_kernel is analogous to gpu_kernel in + * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses + * TensorIterator to launch a kernel. However, the differences are + * - it launches a grid-stride loop based kernel. The kernel is not + * generic like elementwise_kernel in Loops.cuh and is specialized + * for the distribution kernels here. + * - For big size tensors, we can launch multiple kernels recursively + * (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox + * offset calculation is done in this function. + * + * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh + * to have grid-stride loop kernel and then use that to launch our distribution + * kernels? Note that we need a grid-stride loop kernel because, we found by testing + * that it achieves peak effective bandwidth. + */ +template +void distribution_nullary_kernel(at::TensorIteratorBase& iter, + RNG gen, + const dist_t& dist_func, + const transform_t transform_func) { + static_assert(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto execution_policy = calc_execution_policy(numel); + auto counter_offset = std::get<0>(execution_policy); + auto grid = std::get<1>(execution_policy); + auto block = std::get<2>(execution_policy); + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = at::cuda::getCurrentCUDAStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + auto offset_calc = make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +// Binary kernel +template +__global__ void distribution_binary_elementwise_kernel( + int numel, + func_t f, + PhiloxCudaState philox_args, + typename function_traits::result_type *output_data, + const typename function_traits::template arg<1>::type *input_data_1, + const typename function_traits::template arg<2>::type *input_data_2, + inp_offset_calc_t inp_calc, + out_offset_calc_t out_calc) { + auto seeds = at::cuda::philox::unpack(philox_args); + + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; + + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); + + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // load data into registers + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = inp_calc.get(input_idx); + inputs_1[i] = input_data_1[offsets[0]]; + inputs_2[i] = input_data_2[offsets[1]]; + + thread_idx += num_threads(); + } + + // compute and store + thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = out_calc.get(input_idx); + output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); + thread_idx += num_threads(); + } +} + +template +void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) { + static_assert(std::is_same::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t"); + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + using output_t = typename function_traits::result_type; + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_binary_kernel(sub_iter, philox_args, f); + } + return; + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing()); + + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + output_t *output_data = static_cast(iter.data_ptr(0)); + const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); + const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); + + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + + if (iter.is_contiguous()) { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace +}} // namespace at::native + + +namespace at { +namespace native { +namespace templates { +namespace cuda { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { + if (( + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) && range >= 1ULL << 32) + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { + return curand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] { + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int_full_range(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, c10::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cuda", [&] { + if (std::is_same::value || std::is_same::value) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [] __device__ (uint32_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { + return curand4(state); + }, + random_func); + } + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, RNG gen) { + random_kernel(iter, gen); + } +}; + +// ==================================================================================================================== + +template +void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform4(state); }, + transform); + } +} + +template +void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal4(state); }, + transform); + } +} + +// ==================================================== Normal ======================================================== + +template +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { + auto iter = TensorIterator::borrowing_nullary_op(self); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda to multiply std and add mean + auto normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::normal(rand, mean, std)); + }; + normal_and_transform(iter, gen, normal_func); + }); +} + +template +struct NormalKernel { + void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================== + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + using opmath_t = at::opmath_type; + auto range = static_cast(to-from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from, to] __device__ (opmath_t rand) { + // Compute output value before reversing the bounds + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947 + auto value = static_cast(rand * range + from); + // reverse the bounds of curand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 + auto reverse_bound_value = value == to ? from : value; + return reverse_bound_value; + }; + uniform_and_transform(iter, gen, uniform_func); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda for log_normal transformation + auto log_normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::log_normal(transformation::normal(rand, mean, std))); + }; + normal_and_transform(iter, gen, log_normal_func); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for geometric transformation + auto geometric_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::geometric(rand, p)); + }; + uniform_and_transform(iter, gen, geometric_func); + }); +} + +template +struct GeometricKernel { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// ================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] { + using accscalar_t = at::acc_type; + auto lambda = static_cast(lambda_); + // define lambda for exponential transformation + auto exponential_func = [lambda] __device__ (accscalar_t rand) { + return static_cast(transformation::exponential(rand, lambda)); + }; + uniform_and_transform(iter, gen, exponential_func); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] { + using accscalar_t = at::acc_type; + auto median = static_cast(median_); + auto sigma = static_cast(sigma_); + // define lambda for cauchy transformation + auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) { + return static_cast(transformation::cauchy(rand, median, sigma)); + }; + uniform_and_transform(iter, gen, cauchy_func); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) { + cauchy_kernel(iter, median, sigma, check_generator(gen)); + } +}; + +// ==================================================== Bernoulli ===================================================== + +template +void bernoulli_tensor_cuda_kernel( + const TensorBase &ret, const at::TensorBase &p, + PhiloxCudaState philox_args) { + auto functor = [philox_args] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, + const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + auto seeds = at::cuda::philox::unpack(philox_args); + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // See Note [Register spilling in curand call for CUDA < 10] + float4 rand = curand_uniform4(&state); + switch (n) { + case 4: { + CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1); + v4 = static_cast(rand.w <= p4); + // fallthrough + } + case 3: { + CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1); + v3 = static_cast(rand.z <= p3); + // fallthrough + } + case 2: { + CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1); + v2 = static_cast(rand.y <= p2); + // fallthrough + } + case 1: { + CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1); + v1 = static_cast(rand.x <= p1); + } + } + }; + // The template argument `4` below indicates that we want to operate on four + // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. + at::cuda::CUDA_tensor_apply2(ret, p, functor); +} + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(10); + } + TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { + if (std::is_same::value) { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } else { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } + }); +} + +template +void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for bernoulli transformation + auto bernoulli_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::bernoulli(rand, p)); + }; + uniform_and_transform(iter, gen, bernoulli_func); + }); +} + +template +struct BernoulliKernel { + void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { + bernoulli_kernel(iter, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}}}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Distributions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Distributions.h new file mode 100644 index 0000000000000000000000000000000000000000..1a34fdfdf31494faab439544578be8aaf950dc32 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Distributions.h @@ -0,0 +1,25 @@ +#pragma once + +namespace at { +struct CUDAGeneratorImpl; +struct TensorIteratorBase; +class TensorBase; + +namespace native { + +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen); + +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen); + +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen); + +void launch_dirichlet_kernel(TensorIteratorBase &iter); + +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter); + +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0d8d45c1defb90af4da7d2c39d914d3d88ddafc3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -0,0 +1,22 @@ +#pragma once +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor embedding_backward_cuda_kernel( + const Tensor &grad, + const Tensor &orig_indices, + const Tensor &sorted_indices, + const Tensor &count, + int64_t num_weights, + int padding_idx = -1, + bool mode_mean = false, + const Tensor &offset2bag = Tensor(), + const Tensor &bag_size = Tensor(), + const Tensor &per_sample_weights = Tensor()); + +}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..55e4fd7a598907f452d033f73816c16b7c6e22b8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh @@ -0,0 +1,681 @@ +#pragma once +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// TODO(crcrpar): Handle version bump in codegen. +// rel: +// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482 +inline void increment_version(TensorList tensors) { + for (const auto& t : tensors) { + t.unsafeGetTensorImpl()->bump_version(); + } +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListScalarListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ bool init_args( + T** args, + FusedOptimizerTensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ void load_args( + T r_args[][kILP], + T** args, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const auto i = i_start + threadIdx.x + ii * blockDim.x; + for (int r_index = 0; r_index < depth; r_index++) { + r_args[r_index][ii] = 0; + if (i < n && i < chunk_size) { + r_args[r_index][ii] = args[r_index][i]; + } + } + } +} + +template +__device__ void store_args( + T* dst, + T* src, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const int64_t i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) + dst[i] = src[ii]; + } +} + +template +__device__ __forceinline__ void binary_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args + // has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +template +__device__ __forceinline__ void pointwise_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args + // has depth 3 + load_args<3>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +// +// Binary Functors +// +template +struct BinaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpListAlphaFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct BinaryOpScalarTensorFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + T* scalar, + opmath_t alpha) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), + // r_args has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Unary Functors +// + +template +struct ZeroFunctor { + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata<1>& tl) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const auto all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + // store + load_store(args[0], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct UnaryOpFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Pointwise Functors +// + +template +struct PointwiseOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[depth - 1][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + // store + load_store(args[2], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + store_args(args[2], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + static_assert(depth == 3 || depth == 4, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(a, b); + } +}; + +template +struct reverse_power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(b, a); + } +}; + +} // namespace +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9b08911b1d9500f200d55e031ffff7658d5491f0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace at::native { + +// std:: does not have clamp functors +template +struct minimum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a < b) ? a : b; + } +}; + +template +struct maximum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a > b) ? a : b; + } +}; + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a0e3b16c3a43ac28fc9091c80f0b2650526a551d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh @@ -0,0 +1,321 @@ +#pragma once +#include +#include + +namespace at { namespace native { + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, +// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). +// if align_corners: -1 and +1 get sent to the centers of the corner pixels +// -1 --> 0 +// +1 --> (size - 1) +// scale_factor = (size - 1) / 2 +// if not align_corners: -1 and +1 get sent to the image edges +// -1 --> -0.5 +// +1 --> (size - 1) + 0.5 == size - 0.5 +// scale_factor = size / 2 +template +static __forceinline__ __device__ +scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1.f) * size - 1) / 2; + } +} + +// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize +// except that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static __forceinline__ __device__ +scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size, + bool align_corners, scalar_t *grad_in) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +// Clips coordinates to between 0 and clip_limit - 1 +template +static __forceinline__ __device__ +scalar_t clip_coordinates(scalar_t in, int clip_limit) { + return ::min(static_cast(clip_limit - 1), ::max(in, static_cast(0))); +} + +// clip_coordinates_set_grad works similarly to clip_coordinates except that +// it also returns the `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static __forceinline__ __device__ +scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) { + // Note that it is important for the gradient calculation that borders + // are considered out of bounds. + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + scalar_t max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +static __forceinline__ __device__ +scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = ::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +// reflect_coordinates_set_grad works similarly to reflect_coordinates except +// that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static __forceinline__ __device__ +scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high, + scalar_t *grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + // `fmod` returns same sign as `in`, which is positive after the `if` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ +scalar_t safe_downgrade_to_int_range(scalar_t x){ + // -100.0 does not have special meaning. This is just to make sure + // it's not within_bounds_2d or within_bounds_3d, and does not cause + // undefined behavior. See #35506. + if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast(x))) + return static_cast(-100.0); + return x; +} + +template +static __forceinline__ __device__ +scalar_t compute_coordinates(scalar_t coord, int size, + GridSamplerPadding padding_mode, + bool align_corners) { + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2*(size - 1)); + } else { + coord = reflect_coordinates(coord, -1, 2*size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +// Computes the pixel source index value for a grid coordinate +template +static __forceinline__ __device__ +scalar_t grid_sampler_compute_source_index( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +// grid_sampler_compute_source_index_set_grad works similarly to +// grid_sampler_compute_source_index except that it also returns the +// `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static __forceinline__ __device__ +scalar_t grid_sampler_compute_source_index_set_grad( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners, + scalar_t *grad_in) { + scalar_t grad_clip, grad_refl; + coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); + } else { + coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); + } + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +static __forceinline__ __device__ +bool within_bounds_2d(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +static __forceinline__ __device__ +bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +} + +template +static __forceinline__ __device__ +scalar_t get_value_bounded( + scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) { + return data[iy * sH + ix * sW]; + } + return static_cast(0); +} + +template +static __forceinline__ __device__ +void safe_add_2d(scalar_t *data, int h, int w, + int sH, int sW, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t memory_span) { + if (within_bounds_2d(h, w, H, W)) { + fastAtomicAdd(data, + NC_offset + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +static __forceinline__ __device__ +void safe_add_3d(scalar_t *data, int d, int h, int w, + int sD, int sH, int sW, int D, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t memory_span) { + if (within_bounds_3d(d, h, w, D, H, W)) { + fastAtomicAdd(data, + NC_offset + d * sD + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +static __forceinline__ __device__ +void add_value_bounded( + scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + scalar_t delta, + GridSamplerPadding padding_mode, + bool align_corners, + const index_t NC_offset, + const index_t memory_span) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + safe_add_2d(data, iy, ix, sH, sW, H, W, delta, NC_offset, memory_span); +} + +// Calculate the differential of the cubic convolution, i.e. `d coeff / d x` +template +static __forceinline__ __device__ +void get_cubic_coefficients_grad( + scalar_t coeffs[4], + scalar_t t) { + + // Must be the same as forward calculation in + // aten/src/ATen/native/cuda/UpSample.cuh:get_cubic_upsample_coefficients + scalar_t A = -0.75; + + scalar_t x; + x = -1 - t; // 1 < x = |-1 - tx| < 2 + coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A; + x = -t; // x = |0 - tx| <= 1 + coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 1 - t; // x = |1 - tx| <= 1 + coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 2 - t; // 1 < x = |2 - tx| < 2 + coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A; +} + + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6f350c550ce93667bdf21b8b0c7d9798fdf5f15f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh @@ -0,0 +1,187 @@ +#pragma once + +#include + +#if AT_USE_JITERATOR() + +#include + +#include +#include +#include + +#include + +#include + +namespace at { +namespace native { + +/* Note [Jiterator] +The "jiterator" simply just-in-time compiles the same kernels that +Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time, +build size, and initial CUDA context size. + +By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels. +This behavior is controlled with two environment variables: + - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use + - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels + +The jiterator currently has some limitations, however. It cannot: + - handle math on complex datatypes + - handle kernels with scalar parameters + +These improvements will likely come soon. + +For examples of how to use the jiterator see the i1 and gcd kernel +implementations, which pass jittable strings implementing their +operations instead of the typical CUDA functors. + +To pass a runtime argument (similar to lambda captures in non-JIT kernels), +we need to pass to additional arguments to `jitted_gpu_kernel` by value. +Currently only primitive C++ types used for computation are valid. +The order of these extra arguments should be same as the order they appear +in kernel's function signature. (look at polygamma for example) + +NOTE: One big restriction being that these arguments should be after the +arguments provided by TensorIterator. Eg. While capturing `n`, where +`scalar_t x` and `scalar_t y` are provided by TensorIterator, +* foo(scalar_t x, scalar_t y, int n) works! +* foo(int n, scalar_t x, scalar_y) doesn't work +* foo(scalar_t x, int n, scalar_y) doesn't work + +*/ + +// Entrypoint for jitted GPU kernels. +// Only handles elementwise unary and binary kernels with a +// common dtype and a single output. +// NOTE: this assumes the op's iterator has a common_dtype. +// NOTE: We use std::tuple instead of parameter pack +// for `extra_args` due to following +// bug on older versions of clang +// https://bugs.llvm.org/show_bug.cgi?id=23029 +template < + char const* name, + typename return_type, + typename f_inputs_type, + int arity, + typename... Args> +void jitted_gpu_kernel( + TensorIteratorBase& iter, + const std::string& f, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + at::opmath_type scalar_val = 0, + std::tuple extra_args = std::make_tuple()) { + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel( + sub_iter, f, scalar_pos, scalar_val, extra_args); + } + + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's dtype differs from the common dtype + // or if the result dtype differs from the output's dtype + // Note: this is intentionally divergent from calling needs_dynamic_casting, + // which is more general and inspects a lambda to determine if dynamic + // casting is needed. + bool needs_dynamic_casting = false; + + // Checks output + const ScalarType return_scalar_type = c10::CppTypeToScalarType::value; + const auto dtype0 = iter.dtype(0); + if (dtype0 != return_scalar_type) { + needs_dynamic_casting = true; + } + + // Checks input(s) + const ScalarType inputs_scalar_type = c10::CppTypeToScalarType::value; + for (auto i = decltype(arity){1}; i < (arity + 1); ++i) { + const auto dtypei = iter.dtype(i); + if (dtypei != inputs_scalar_type) { + needs_dynamic_casting = true; + break; + } + } + if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) { + // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used + // for computation in the generated code and hence we pass a dummy + // value of `0`. + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::NoScalar>( + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); + } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::RhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + + } else { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::LhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + } +} + +// TODO: support runtime state capture similar to `jitted_gpu_kernel`. +template +void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type) + using opmath_t = at::opmath_type; + if (iter.is_cpu_scalar(1)) { + auto scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val); + } else if (iter.is_cpu_scalar(2)) { + auto scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val); + } else { + jitted_gpu_kernel(iter, f); + } +} + +}} // at::native + +#endif // AT_USE_JITERATOR() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d07f54093e8136234cba6879f2dfff17a6d0e5a3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh @@ -0,0 +1,149 @@ +#pragma once +#include + +#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))) +#include +#endif + +namespace at { +namespace native { + +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +// for channels-last +__device__ __forceinline__ size_t +idx_cl( + const size_t n, const size_t h, const size_t w, const size_t c, + const size_t height, const size_t width, const size_t channel +) { + return ((n * height + h) * width + w) * channel + c; +} + +// fastSpecializedAtomicAdd (and fastAtomicAdd) are an optimization +// that speed up half-precision atomics. The situation with half +// precision atomics is that we have a slow __half atomic, and +// a fast vectored __half2 atomic (this can be worth up to a 6x +// speedup, see https://github.com/pytorch/pytorch/pull/21879). +// We can convert a __half atomic into a __half2 atomic by simply +// pairing the __half with a zero entry on the left/right depending +// on alignment... but only if this wouldn't cause an out of bounds +// access! Thus, you must specify tensor and numel so we can check +// if you would be out-of-bounds and use a plain __half atomic if +// you would be. +template < + typename scalar_t, + typename index_t, + typename std::enable_if::value>::type* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if ( \ + (defined(USE_ROCM)) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __half* target_addr = reinterpret_cast<__half*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (low_byte && index < (numel - 1)) { + __half2 value2; + value2.x = static_cast<__half>(value); + value2.y = __int2half_rz(0); + atomicAdd(reinterpret_cast<__half2*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = static_cast<__half>(value); + atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2); + + } else { + atomicAdd( + reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value)); + } +#endif +} + +template < + typename scalar_t, + typename index_t, + typename std::enable_if::value>::type* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if ( \ + (defined(USE_ROCM)) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__nv_bfloat162) == 0); + + if (low_byte && index < (numel - 1)) { + __nv_bfloat162 value2; + value2.x = *reinterpret_cast<__nv_bfloat16*>(&value); + value2.y = __int2bfloat16_rz(0); + atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __nv_bfloat162 value2; + value2.x = __int2bfloat16_rz(0); + value2.y = *reinterpret_cast<__nv_bfloat16*>(&value); + atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2); + + } else { + atomicAdd( + reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value)); + } +#endif +} + + +template < + typename scalar_t, + typename index_t, + typename std::enable_if::value && !std::is_same::value >::type* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { + gpuAtomicAddNoReturn(tensor + index, value); +} + +template +__device__ __forceinline__ void fastAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value, + bool fast_atomics) { + if (fast_atomics) { + fastSpecializedAtomicAdd(tensor, index, numel, value); + } else { + gpuAtomicAddNoReturn(tensor + index, value); + } +} + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5188801f12c60b67d2da3486d02ced455a17231c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh @@ -0,0 +1,3375 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { +// See note [Jiterator] +// TODO: elaborate in this comment on the structure of math.cuh +#if AT_USE_JITERATOR() + +const auto ndtri_string = jiterator_stringify( + /* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ + template + T polevl(const T x, const T A[], const int len) { + // NOTE: This `polevl` is different from other `polevl` + // implementation (in PyTorch) which expect the `len` to be + // `len(A) - 1` instead of `len(A)`. + T result = 0; + for (int i = 0; i < len; ++i) { + result = result * x + A[i]; + } + return result; + } + + /* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ + template + T ndtri(T y0) { + + constexpr T zero = 0; + constexpr T one = 1; + + // Handles special cases + if (y0 == zero) { + return NEG_INFINITY; + } + if (y0 == one) { + return POS_INFINITY; + } + if (y0 < zero || y0 > one) { + return NAN; + } + + bool code = true; + T y = y0; + // Note: the constant 0.135... is equal to exp(-2) + if (y > one - T{0.13533528323661269189}) { + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9})); + return x * s2pi; + } + + T x = sqrt(T{-2.} * log(y)); + const T x0 = x - (log(x) / x); + + const T z = one / x; + T x1; + + /* y > exp(-32) = 1.2664165549e-14 */ + if (x < T{8.0}) { + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. + */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9}); + } else { + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9}); + } + + x = x0 - x1; + return (!code) ? x : -x; + } +); // ndtri_string + +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + +const auto gcd_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } +); // gcd_string + +const auto lcm_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } + + template + T lcm(const T a, const T b) { + T g = gcd(a, b); + return (g == T{0}) ? T{0} : abs(a / g * b); + } +); // lcm_string + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma +const auto digamma_string = jiterator_stringify( + template + T digamma(T x) { + static const double PI_f64 = 3.14159265358979323846; + + // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard + if (x == 0) { + return copysign(POS_INFINITY, -x); + } + + T result = 0; + if (x < 0) { + // Short-circuits if x is a negative integer and returns NaN + // per the C++ standard + const bool x_is_integer = (x == trunc(x)); + if (x_is_integer) { + return NAN; + } + + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = modf(static_cast(x), &q); + result = - PI_f64 / tan(PI_f64 * r); + x = 1 - x; + } + + while (x < T{10}) { + result -= T{1} / x; + x += T{1}; + } + + if (x == T{10}) { + return result + T{2.25175258906672110764}; + } + + T y = 0; + if (x < T{1.0e17}) { + const T A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + + T z = T{1} / (x * x); + + T polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return log(x) - (T{0.5} / x) - y + result; + } +); // digamma_string + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +const auto zeta_string = jiterator_stringify( + template + T zeta(T x, T q) { + const T MACHEP{1.11022302462515654042E-16}; + constexpr T zero{0}; + constexpr T half{0.5}; + constexpr T one{1}; + static const T A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + int i = 0; + T a, b, k, s, t, w; + + // Short-circuits x -> +infty + if (x == one) { + return POS_INFINITY; + } + + // Short-circuits x < 1 -> NaN + if (x < one) { + return NAN; + } + + // Short-circuits negative q integers map to +infty, + // negative q non-integers map to NaN + if (q <= zero) { + if (q == floor(q)) { + return POS_INFINITY; + } + if (x != floor(x)) { + return NAN; + } + } + + s = pow(q, -x); + a = q; + i = 0; + b = zero; + while ((i < 9) || (a <= T{9.0})) { + i += 1; + a += one; + b = pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = fabs(t / s); + + if (t < MACHEP) { + return s; + } + + k += one; + a *= x + k; + b /= w; + k += one; + } + + return s; + } +); // zeta_string + +const auto trigamma_string = jiterator_stringify( + template + T trigamma(T x) { + const T PI{3.14159265358979323846}; + T sign = 1; + T result = 0; + + if (x < T{0.5}) { + sign = -1; + T sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + + for (int i = 0; i < 6; ++i) { + result += T{1} / (x * x); + x += 1; + } + + const T one{1}; + const T ixx = one / (x*x); + result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x; + return sign * result; +} +); // trigamma_string + +const auto lgamma_string = jiterator_stringify( + template + T lgamma_kernel(T a) { + return lgamma(a); + } +); // lgamma_string + +const auto polygamma_string = zeta_string + jiterator_stringify( + template + T polygamma(T x, int n) { + // already blocked if n <= 1 + const auto one = T{1}; + return ((n % 2) ? one : -one) * exp(lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); + } +); // polygamma_string + +const auto exp2_string = jiterator_stringify( + template + T exp2_impl(T a) { + return exp2(a); + } + + namespace std { template class complex; } + template + std::complex exp2_impl(std::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return exp(ln_2 * x); + } + + template + T exp2_kernel(T a) { + return exp2_impl(a); + } +); // exp2_string + +const auto erfc_string = jiterator_stringify( + template + T erfc_kernel(T a) { + return erfc(a); + } +); // erfc_string + +const auto erfinv_string = jiterator_stringify( + template + T erfinv_kernel(T a) { + return erfinv(a); + } +); // erfinv_string + +const auto entr_string = jiterator_stringify( + template + T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return -a * log(a); + } + + if (a == 0) { + return 0; + } + + return NEG_INFINITY; + } +); // entr_string + +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. +const auto i0_string = jiterator_stringify( + template + T chbevl(T x, const T array[], const int len) { + + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i0(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return exp(x) * chbevl(y, A, int{30}); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + const T B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x); + } +); // i0_string + +const auto i1_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i1(T _x) { + const T x = fabs(_x); + + if (x <= T{8.0}) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const T y = x / T{2.0} - T{2.0}; + const T out = exp(x) * x * chbevl(y, coefficients, int{29}); + return (_x < T{0.0}) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x); + return (_x < T{0.}) ? -out : out; + } +); // i1_string + +const auto i1e_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + // See double and float instantiations below + template + T i1e(T _x) { } + + // Double specialization (uses different coefficients than the float version) + template<> + double i1e(double _x) { + const double x = fabs(_x); + if (x <= double{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const double coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double y = x / double{2.} - double{2.}; + const double out = chbevl(y, coefficients, int{29}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const double coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x); + return (_x < double{0.}) ? -out : out; + } + + // Float specialization (uses different coefficients than the double version) + template<> + float i1e(float _x) { + const float x = fabsf(_x); + if (x <= float{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const float y = x / float{2.} - float{2.}; + const float out = chbevl(y, coefficients, int{17}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x); + return (_x < float{0.}) ? -out : out; + } +); // i1e_string + +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + +const auto sinc_string = jiterator_stringify( + template + T sinc(T a) { + if (a == T(0)) { + return T(1); + } else { + constexpr T pi = T(3.14159265358979323846L); + T product = pi * a; + return std::sin(product) / product; + } + } +); // sinc_string + +const auto erfcx_string = jiterator_stringify( + /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ + + /* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). + + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived therefrom] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. + */ + + /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x). + + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. + */ + + // TODO: review if this is computing in double when given a float input + template + T erfcx_y100(T y100) { + switch (static_cast(y100)) { + case 0: { + T t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + T t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + T t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + T t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + T t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + T t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + T t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + T t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + T t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + T t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + T t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + T t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + T t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + T t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + T t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + T t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + T t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + T t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + T t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + T t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + T t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + T t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + T t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + T t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + T t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + T t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + T t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + T t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + T t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + T t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + T t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + T t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + T t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + T t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 34: { + T t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + T t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + T t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + T t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + T t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + T t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + T t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + T t = 2*y100 - 83; + return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + T t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + T t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + T t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + T t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + T t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + T t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + T t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + T t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + T t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + T t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + T t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + T t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + T t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + T t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + T t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + T t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + T t = 2*y100 - 117; + return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + T t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + T t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + T t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + T t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + T t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + T t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + T t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + T t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + T t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + T t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + T t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + T t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 71: { + T t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + T t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + T t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + T t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + T t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + T t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + T t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + T t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + T t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + T t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + T t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + T t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + T t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 84: { + T t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + T t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + T t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + T t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + T t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + T t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + T t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + T t = 2*y100 - 183; + return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + T t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + T t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + T t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + T t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + T t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + T t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + T t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + T t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.; + } + + template + T erfcx(T x) { + // Short-circuits on NaN (returning NaN) + if (x != x) { + return x; + } + + if (x >= 0) { + if (x > T{50}) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + + if (x > T{5e7}) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75})); + } + + // x >= 0 x <= 50 + return erfcx_y100(T{400} / (T{4} + x)); + } + + // x < 0 + if (x < T{-26.7}) { + return POS_INFINITY; + } else if (x < T{-6.1}) { + return T{2} * exp(x * x); + } + + // x < 0 and x >= -6.1 + return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x)); + } +); // erfcx_string + +const auto airy_ai_string = jiterator_stringify( + template + T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + +7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + +8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (isinf(x)) { + return NAN; + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd = 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846); + + return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > T(1.11022302462515654042e-16)) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; + } // T airy_ai(T x) +); // airy_ai_string + +const auto bessel_j0_string = jiterator_stringify( + template + T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j0_forward(T x) +); // bessel_j0_string + +const auto bessel_y0_string = bessel_j0_string + jiterator_stringify( + template + T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x < T(0.0)) { + NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y0_forward(T x) +); // bessel_y0_string + +const auto bessel_j1_string = jiterator_stringify( + template + T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j1_forward(T x) +); // bessel_j1_string + +const auto bessel_y1_string = bessel_j1_string + jiterator_stringify( + template + T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y1_forward(T x) +); // bessel_y1_string + +const auto chebyshev_polynomial_t_string = jiterator_stringify( + template + T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x) < T(1.0))) { + return cos(n * acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); + } // chebyshev_polynomial_t_forward(T x, T n) +); // chebyshev_polynomial_t_string + +const auto chebyshev_polynomial_u_string = jiterator_stringify( + template + T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x)) != T(0.0)) { + return sin((n + 1) * acos(x)) / sin(acos(x)); + } + + return (n + 1) * cos((n + 1) * acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T chebyshev_polynomial_u_forward(T x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); + } // chebyshev_polynomial_u_forward(T x, T n) +); // chebyshev_polynomial_u_string + +const auto chebyshev_polynomial_v_string = jiterator_stringify( + template + T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x) / T(2.0)) != T(1.0)) { + return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); + } // chebyshev_polynomial_v_forward(T x, T n) +); // chebyshev_polynomial_v_string + +const auto chebyshev_polynomial_w_string = jiterator_stringify( + template + T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (cos(acos(x) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); + } // chebyshev_polynomial_w_forward(T x, T n) +); // chebyshev_polynomial_w_string + +const auto hermite_polynomial_h_string = jiterator_stringify( + template + T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_h_forward(T x, int64_t n) + + template + T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); + } // hermite_polynomial_h_forward(T x, T n) +); // hermite_polynomial_h_string + +const auto hermite_polynomial_he_string = jiterator_stringify( + template + T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_he_forward(T x, int64_t n) + + template + T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); + } // hermite_polynomial_he_forward(T x, T n) +); // hermite_polynomial_he_string + +const auto laguerre_polynomial_l_string = jiterator_stringify( + template + T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // laguerre_polynomial_l_forward(T x, int64_t n) + + template + T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); + } // laguerre_polynomial_l_forward(T x, T n) +); // laguerre_polynomial_l_string + +const auto legendre_polynomial_p_string = jiterator_stringify( + template + T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // legendre_polynomial_p_forward(T x, int64_t n) + + template + T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); + } // legendre_polynomial_p_forward(T x, T n) +); // legendre_polynomial_p_string + +const auto modified_bessel_i0_string = jiterator_stringify( + template + T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + -5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + +1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return exp(abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i0_forward(T x) +); // modified_bessel_i0_string + +const auto modified_bessel_i1_string = jiterator_stringify( + template + T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + if (x < T(0.0)) { + return -(T(0.5) * (a - p) * abs(x) * exp(abs(x))); + } + + return T(0.5) * (a - p) * abs(x) * exp(abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x))); + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i1_forward(T x) +); // modified_bessel_i1_string + +const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k0_forward(T x) +); // modified_bessel_k0_string + +const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / sqrt(x); + } // T scaled_modified_bessel_k0_forward(T x) +); // scaled_modified_bessel_k0_string + +const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k1_forward(T x) +); // modified_bessel_k1_string + +const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / sqrt(x)); + } // T scaled_modified_bessel_k1_forward(T x) +); // scaled_modified_bessel_k1_string + +const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + return cos(n * acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_t_forward(T x, T n) +); // shifted_chebyshev_polynomial_t_string + +const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0))) != T(0.0)) { + return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0))); + } + + return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_u_forward(T x, T n) +); // shifted_chebyshev_polynomial_u_string + +const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_v_forward(T x, T n) +); // shifted_chebyshev_polynomial_v_string + +const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) { + if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_w_forward(T x, T n) +); // shifted_chebyshev_polynomial_w_string + +const auto spherical_bessel_j0_string = jiterator_stringify( + template + T spherical_bessel_j0_forward(T x) { + if (isinf(x)) { + return T(0.0); + } + + if (abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return sin(x) / x; + } // T spherical_bessel_j0_forward(T x) +); // spherical_bessel_j0_string + +#else // !AT_USE_JITERATOR() -- kernels must be precompiled + +template +static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) { + scalar_t a = ::abs(a_in); + scalar_t b = ::abs(b_in); + while (a != 0) { + scalar_t c = a; + a = b % a; + b = c; + } + return b; +} + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma + using accscalar_t = at::acc_type; + static const double PI_f64 = 3.14159265358979323846; + const accscalar_t PSI_10 = 2.25175258906672110764; + const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t x = static_cast(in); + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(static_cast(INFINITY), -x); + } + + bool x_is_integer = x == ::trunc(x); + accscalar_t result = 0; + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return static_cast(NAN); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = ::modf(static_cast(x), &q); + result = static_cast(- PI_f64 / ::tan(PI_f64 * r)); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return static_cast(result + PSI_10); + } + + accscalar_t y = 0; + if (x < 1.0e17) { + accscalar_t z = 1 / (x * x); + + accscalar_t polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return static_cast(::log(x) - (static_cast(0.5) / x) - y + result); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { + using accscalar_t = at::acc_type; + const accscalar_t PI = 3.14159265358979323846; + accscalar_t x = static_cast(in); + accscalar_t sign = +1; + accscalar_t result = 0; + if (x < 0.5f) { + sign = -1; + accscalar_t sin_pi_x = ::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const accscalar_t one = static_cast(1); + const accscalar_t ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x; + return static_cast(sign * result); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 + scalar_t x = ::abs(_x); + + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i0e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; + return (::exp(x) * chbevl(y, A, len)); + } + + auto coeff_pair = chebyshev_coefficients_i0e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + + return std::make_tuple(coefficients, 29); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + return std::make_tuple(coeff, 17); +}; + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if::value, std::tuple>::type + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + return std::make_tuple(coeff, 7); +}; + +template +static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i1e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = ::exp(x) * x * chbevl(y, A, len); + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto coeff_pair = chebyshev_coefficients_i1e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto coeff_pair = chebyshev_coefficients_i1e_A(); + auto A = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = chbevl(y, A, len) * x; + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto coeff_pair = chebyshev_coefficients_i1e_B(); + auto B = std::get<0>(coeff_pair); + auto len = std::get<1>(coeff_pair); + const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +#endif // AT_USE_JITERATOR() (this closes the "else" branch of a if/else preprocessor directive) + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MiscUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MiscUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..e616a7d1fcfb8254528dccc4e6b9d0658ffe1a3c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MiscUtils.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include +#include +#include + +namespace at { +namespace native { + +static inline int cuda_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + TORCH_CHECK(static_cast(result) == value, + "cuda_int_cast: The value of ", varname, "(", (long long)value, + ") is too large to fit into a int (", sizeof(int), " bytes)"); + return result; +} + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline Storage pin_memory(int64_t size) { + auto* allocator = cuda::getPinnedMemoryAllocator(); + int64_t adjusted_size = size * sizeof(T); + return Storage( + Storage::use_byte_size_t(), + adjusted_size, + allocator, + /*resizable=*/false); +} + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4553276bab684eca9d858bc81ae8f34c9445afa5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh @@ -0,0 +1,401 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace { + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension. +// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024. +// The template arguments have the following meaning: +// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. +// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. +// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. +// This is important because it means only __shfl_ instructions are required for reductions. +// Note that this means WARP_SIZE must be a power of two and <= architecture warp size. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. +// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. +// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. +// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed. +// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. +// This allows SoftMax to be fused with a cast immediately following the SoftMax. +// The mask should have the same shape as input, with a boolean indicate if the value is masked. +// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D. +// For instance: +// input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor. +// input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor. +// input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor. + +template +__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + int idx_offset = first_batch * stride + local_idx; + + src += idx_offset; + dst += idx_offset; + + if (is_transformer_mask) { + mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx; + } else { + mask += idx_offset; + } + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + elements[i][it] = src[i*element_count+it*WARP_SIZE]; + } else { + elements[i][it] = -std::numeric_limits::infinity(); + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + bool is_meaningful_max = false; + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (is_masked) { + int idx = it*WARP_SIZE; + if ((idx + local_idx) < batch_element_count) { + if (!is_transformer_mask) { + idx += i*element_count; + } + if (!mask[idx]) { + max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + is_meaningful_max = true; + } + } + } else { + max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it]; + } + } + if (is_masked) { + if (!is_meaningful_max) { + max_value[i] = -std::numeric_limits::infinity(); + } + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + int idx = it*WARP_SIZE; + bool valid = (idx + local_idx) < batch_element_count; + if (!is_transformer_mask) { + idx += i*element_count; + } + if (valid) { + if (!mask[idx]) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. + elements[i][it] = 0; + } + } + } else { + if (!is_log_softmax) { + elements[i][it] = 0.; + } + } + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + if (is_log_softmax) sum[i] = std::log(sum[i]); + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_log_softmax) { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); + } else { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; + } + } else { + break; + } + } + } +} + +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x % WARP_SIZE; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } + + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE]; + output_reg[i][it] = output[i*element_count+it*WARP_SIZE]; + } else { + grad_reg[i][it] = acc_t(0); + output_reg[i][it] = acc_t(0); + } + } + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } + // compute gradients + else if (is_log_softmax) { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]); + } else { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); + } + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + softmax_warp_forward \ + <<>>(dst, \ + src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); ; // 1024 + default: + break; + } + } +} + +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + softmax_warp_backward \ + <<>> \ + (grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements, mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 + default: + break; + } + } +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Randperm.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Randperm.cuh new file mode 100644 index 0000000000000000000000000000000000000000..de5affebb8bd5bf065502b3af31a80dc9c562991 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Randperm.cuh @@ -0,0 +1,58 @@ +#include +#include +#include + +#include +#include +#include + +namespace { + +// See note [Algorithm of randperm] +template +__global__ void randperm_handle_duplicate_keys_kernel(T *keys, scalar_t *data, T mask, int n, at::PhiloxCudaState philox_args) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + + // find the beginning of islands + if (tid >= n - 1) return; // out of range + if ((keys[tid] & mask) != (keys[tid + 1] & mask)) return; // not in an island + if (tid != 0 && (keys[tid] & mask) == (keys[tid - 1] & mask)) return; // not the beginning of an island + + // find the size of islands + int island_size = 0; + do { island_size++; } + while ((tid + island_size < n) && (keys[tid + island_size] & mask) == (keys[tid] & mask)); + + // do random permutation inside each island. + data += tid; + auto seeds = at::cuda::philox::unpack(philox_args); + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), tid, std::get<1>(seeds), &state); + for (int i = island_size - 1; i > 0; i--) { + unsigned int r = curand(&state) % (i + 1); + if (i != r) { + scalar_t tmp = data[i]; + data[i] = data[r]; + data[r] = tmp; + } + } +} + +// See note [Algorithm of randperm] +template +void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, c10::optional &gen_) { + auto gen = at::get_generator_or_default(gen_, at::cuda::detail::getDefaultCUDAGenerator()); + int64_t counter_offset = n; + at::PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); + } + T mask = static_cast((1UL << bits) - 1); + randperm_handle_duplicate_keys_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>( + keys, data, mask, n, rng_engine_inputs); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1f67ee3ea63e1035688da33888b7b5c7b820e3ef --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh @@ -0,0 +1,1354 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +using at::detail::Array; + +static inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +// returns floor(log2(n)) +static inline int last_pow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +// returns reduced fraction numerator & denominator +C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; +} + +//template for changing MAX_NUM_THREADS based on op dtype +template +struct mnt_wrapper { + static constexpr int MAX_NUM_THREADS = 512; +}; + +template <> +struct mnt_wrapper >{ + static constexpr int MAX_NUM_THREADS = 256; +}; + +constexpr int max_reduce_threads(c10::ScalarType type) { + return type == kComplexDouble ? 256 : 512; +} + +struct ReduceConfig { + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + + ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs) + : element_size_bytes(element_size_bytes) + , num_inputs(num_inputs) + , num_outputs(num_outputs) {} + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + template + void set_block_dimension(int64_t dim0, int64_t dim1) { + const int max_num_threads = mnt_wrapper::MAX_NUM_THREADS / output_vec_size; + int dim0_pow2 = dim0 < max_num_threads ? static_cast(last_pow2(dim0)) : max_num_threads; + int dim1_pow2 = dim1 < max_num_threads ? static_cast(last_pow2(dim1)) : max_num_threads; + block_width = std::min(dim0_pow2, int(at::cuda::warp_size())); + block_height = std::min(dim1_pow2, int(max_num_threads / block_width)); + block_width = std::min(dim0_pow2, int(max_num_threads / block_height)); + num_threads = block_width * block_height; + } + + int split_input(int parallelism) { + int step = step_input; + step_input *= parallelism; + return step; + } + + int split_output(int parallelism) { + int step = step_output; + step_output *= parallelism; + return step; + } + + dim3 block() const { + return dim3(block_width, block_height); + } + + dim3 grid() const { + return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output); + } + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + int shared_memory_size() const { + if (!should_block_y_reduce() && + (!should_block_x_reduce() || + block_width <= at::cuda::warp_size())) { + return 0; + } + return element_size_bytes * num_threads * output_vec_size; + } + + int64_t global_memory_size() const { + if (!should_global_reduce()) { + return 0; + } + auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output; + if (!should_block_x_reduce()) { + size *= block().x * output_vec_size; + } + return size; + } + + int semaphore_size() const { + if (!should_global_reduce()) { + return 0; + } + return sizeof(int) * grid().x; + } + + int values_per_thread() const { + return div_up(num_inputs, step_input); + } +}; + +std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void reduce_kernel(R reduction) { + reduction.template run(); +} + +template +static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; + std::array strides = { + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, + }; + auto shape = iter.shape().data() + num_reduce_dims; + return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); +} + +template +static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; + std::array strides = { + iter.strides(input_index).data(), + }; + return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); +} + +template +struct func_wrapper_t { + using arg_t = typename binary_function_traits::arg1_t; + using scalar_t = typename binary_function_traits::arg2_t; + + func_t combine; + static inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + func_wrapper_t(const func_t& op) : combine(op) { + } + + // wrap a normal reduction that ignores the index + __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const { + return combine(acc, val); + } +}; + +template +func_wrapper_t func_wrapper(const func_t& op) { + return func_wrapper_t { op }; +} + +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + +template +struct ReduceOp { + using traits = function_traits; + using arg_t = typename std::decay::type>::type; + + using InputCalculator = OffsetCalculator<1, index_t>; + using OutputCalculator = OffsetCalculator<2, index_t>; + + static constexpr bool can_accumulate_in_output = + std::is_convertible::value + && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + ops_t ops; + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceOp( + ops_t ops, + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ops(ops), + ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } + + template + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + index_t output_idx = config.output_idx(); + index_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = at::detail::Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + value = thread_reduce(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE at::detail::Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + CUDA_KERNEL_ASSERT(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](index_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](index_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + index_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(at::native::memory::aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = at::native::memory::aligned_vector; + + index_t idx = config.input_idx(); + const index_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + while (idx * input_vec_size + input_vec_size - 1 < end) { + const auto values_vec = memory::load_vector(data, idx); + #pragma unroll + for (index_t i = 0; i < input_vec_size; i++) { + value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + index_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + const auto value = c10::load(data + idx); + value_list[0] = ops.reduce(value_list[0], value, idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = ops.combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + index_t idx = config.input_idx(); + const index_t end = config.num_inputs; + const index_t stride = config.step_input; + + using arg_vec_t = at::detail::Array; + using load_t = at::native::memory::aligned_vector; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + const auto offset = calc(idx + i * stride) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + } + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + const auto offset = calc(idx) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + idx += stride; + } + idx = idx_; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array block_x_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = ops.warp_shfl_down(value[i], offset); + value[i] = ops.combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE at::detail::Array block_y_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array out, + at::detail::Array value, + typename std::enable_if::type* = nullptr + ) const { + at::detail::Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = ops.combine(*(out[i]), value[i]); + } + return ret; + } + + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(!final_output); + return (out_scalar_t)value; + } + + // This function should never be called -- + // it's the version of `accumulate_in_output` + // when accumulation in the output is not possible. + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array, + at::detail::Array, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return arg_t {}; + } + + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + CUDA_KERNEL_ASSERT(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (T1*)((char*)dst[0] + base_offset); + *res0 = x.first; + } + if (noutputs >= 2) { + // base offset is computed assuming element size being sizeof(T1), so we need to make a + // correction to obtain the correct base offset + auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); + *res1 = x.second; + } + } + + template + C10_DEVICE void set_results_to_output(at::detail::Array value, at::detail::Array base_offset) const { + CUDA_KERNEL_ASSERT(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(ops.project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE at::detail::Array global_reduce(at::detail::Array value, at::detail::Array *acc, char* shared_memory) const { + using arg_vec_t = at::detail::Array; + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + index_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + index_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + index_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } else { + index_t input_offset = threadIdx.y; + index_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +template +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + auto stream = at::cuda::getCurrentCUDAStream(); + int shared_memory = config.shared_memory_size(); + + switch(config.output_vec_size) { + case 4: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + default: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +inline void launch_jitted_reduce_kernel( + std::mutex &jiterator_mutex, + std::array &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int vt0, const ReduceConfig& config, void *reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fn_cache[0]; + break; + case 2: + fn_ptr = &fn_cache[1]; + break; + default: + fn_ptr = &fn_cache[2]; + } + if (!fn_ptr->function) { + int max_threads_codegen = + max_reduce_threads(desc.f_inputs_type) / config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code( + desc, vt0, true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name); + } + constexpr int kernel_args = 1; + void* args[kernel_args]; + args[0] = reduction; + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + +class AccumulationBuffer { + public: + AccumulationBuffer() {} + + AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) { + out_ptr_ = (char*)out_ptr; + if (out_t_size >= acc_t_size) { + // reusing output buffer for accumulation. + acc_ptr_ = (char*)out_ptr; + numerator_ = 1; + denominator_ = 1; + } else { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer_ = allocator.allocate(size); + acc_ptr_ = (char*)buffer_.get(); + numerator_ = acc_t_size; + denominator_ = out_t_size; + reduce_fraction(numerator_, denominator_); + } + } + + char* get_acc_slice(char* out_ptr) { + if (acc_ptr_ == nullptr) { + return nullptr; + } + return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_); + } + + private: + char* acc_ptr_ = nullptr; + char* out_ptr_ = nullptr; + size_t numerator_; + size_t denominator_; + at::DataPtr buffer_; +}; + +template +int get_output_vec_size(const TensorIterator &iter) { + int vec_size = 4; + auto update_vec_size = [&vec_size](uint64_t n) { + while(n % vec_size != 0) { + vec_size /= 2; + } + }; + + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); + update_vec_size(base_address); + + const int output_index = iter.num_reduce_dims(); + update_vec_size(iter.shape()[output_index]); + + int j = 0; + for(auto i : iter.strides(iter.noutputs())) { + if (j != output_index) { + update_vec_size(i / sizeof(scalar_t)); + } + j++; + } + return vec_size; +} + +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ + // Start by assuming that each thread handles a single output and all + // the inputs for that output. + int64_t num_outputs = iter.num_output_elements(); + int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; + + auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); + + int64_t dim0; + int64_t dim1; + int64_t fastest_moving_stride; + bool reduction_on_fastest_striding_dimension; + + if (iter.ndim() > 0) { + // Adjust block size to map block width to fastest changing dimension of input + // tensor. This grants the best possible memory accessing pattern, given that + // for non-contiguous tensor with space in between, we cannot have perfect + // memory coalescing. + reduction_on_fastest_striding_dimension = + (iter.num_reduce_dims() == iter.ndim()) || + (iter.strides(/*arg=*/input_index)[0] < + iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]); + // Notice that dim0 & dim1 does NOT guarantee any launch configuration here! + // dim0 & dim1 are more like the upper bound of the block dimension. The + // actual launch config and reduction scheme is determined by setting values + // to `config.input_mult` and `config.output_mult`. + // We try to max out dim1 so that we have enough threads per CTA to deliver + // performance for larger problem size. + if (reduction_on_fastest_striding_dimension) { + // Map block.x to the fastest reducing dimension. It implies: + // 1. block_x_reduce is required. + // 2. block.y now max out to num_outputs. + dim0 = inputs_per_output; + dim1 = num_outputs; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; + } else { + // Map block.x to the fastest non reducing dimension. It implies: + // 1. block_x_reduce is turned off. + // 2. block.y now max out to inputs_per_output. + dim0 = num_outputs; + dim1 = inputs_per_output; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; + } + } else { + reduction_on_fastest_striding_dimension = true; + fastest_moving_stride = sizeof(scalar_t); + dim0 = 1; + dim1 = 1; + } + + // We do vectorization to gain better memory access, there are two cases which we call + // "vectorize along input" and "vectorize along output". Note that the "input/output" + // here does not mean we are vectorizing load/store instructions. We always only vectorize + // load instructions. + // + // Case 1: "vectorize along input" + // This case happens when we are reducing along fastest moving dimesion. In such case, threads + // with the same threadIdx.y works on the same reduction cooperatively and will produce results + // for the same output. In such case, values in each loaded vector always correspond to the same output. + // + // Case 2: "vectorize along output" + // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case, + // threads with different threadIdx.x are independent and will produce results for different outputs. + // In such case, values in each loaded vector always correspond to different outputs. + if (fastest_moving_stride == sizeof(scalar_t)) { + if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) { + // Case 1: "vectorize along input" + // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, + // we should avoid vectorization. + config.vectorize_input = true; + dim0 /= config.input_vec_size; + } else if (!reduction_on_fastest_striding_dimension) { + // Case 2: "vectorize along output" + config.output_vec_size = get_output_vec_size(iter); + dim0 /= config.output_vec_size; + } + } + + // Adjust block_width and block_height + config.set_block_dimension(dim0, dim1); + + int block_width = config.block_width; + int block_height = config.block_height; + + if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) { + // Split the input across lanes if the input is contiguous in the reduced + // dimension. This will require reduction between threads using warp + // shuffle instructions and shared memory (if block_width > warpSize). + config.input_mult[0] = config.split_input(block_width); + } else { + // Otherwise split the output across lanes in a warp. + config.output_mult[0] = config.split_output(block_width); + } + + constexpr int min_values_per_thread = 16; + constexpr int max_values_per_thread = 256; + + if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) { + // Divide the input across warps in a thread-block, if that leaves at least + // 16 elements to be summed by each thread. This will require inter-warp + // reduction using shared memory. + config.input_mult[1] = config.split_input(block_height); + } else { + // Otherwise, each warp handles a separate output. + config.output_mult[1] = config.split_output(block_height); + } + + const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads; + const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + const int target_grid_size = num_mp * blocks_per_sm; + int grid = config.grid().x; + if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) { + // Divide the input across thread-blocks if the amount of work per-thread + // is large enough and the size of the output is small enough. This will + // require a reduction using global memory. + // If we decide to split input across blocks, as long as we can get enough + // number of blocks (`target_grid_size`) to balance SM, we should still + // make the number of values per thread large for best performance. + int ctas_per_output1 = div_up(target_grid_size, grid); + int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread); + int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread); + // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have + // a large number of values to deal with. But we don't want values_per_thread to be larger than + // max_values_per_thread + config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); + if (config.ctas_per_output > 1) { + config.input_mult[2] = config.split_input(config.ctas_per_output); + } + } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceOp( + ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); +} + +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + constexpr int nInputs = 1; + constexpr int nOutputs = 1; + static auto desc = at::cuda::jit::make_kernel_descriptor< + out_scalar_t, scalar_t>(name, func, nInputs, nOutputs); + + static std::mutex jiterator_mutex; + static std::vector> fn_cache(c10::cuda::device_count()); + auto &cache = fn_cache[iter.device().index()]; + + launch_jitted_reduce_kernel( + jiterator_mutex, cache, desc, vt0, config, &reduce); +} + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..a67a019ae49e2ec73be935aa4cae703417e9bb19 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h @@ -0,0 +1,20 @@ + +namespace at { +struct TensorIterator; +} + +namespace c10 { +class Scalar; +} + +namespace at { namespace native { + +void norm_launch_kernel(TensorIterator &iter, double val); +void min_launch_kernel(TensorIterator &iter); +void max_launch_kernel(TensorIterator &iter); +void aminmax_launch_kernel(TensorIterator &iter); +void min_all_launch_kernel(TensorIterator &iter); +void max_all_launch_kernel(TensorIterator &iter); +void aminmax_allreduce_launch_kernel(TensorIterator &iter); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..28e65372511bc7b50390a134e01554b6fa9ee171 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace at { +class TensorBase; + +namespace native { + +// NOTE: these functions require output tensors to be contiguous +void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f9de15fdf912b47635b2cb1388001750eec50959 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh @@ -0,0 +1,459 @@ +#pragma once +#include +#include +#include +#include + +#include +#include +#include + +namespace at { +namespace native { + +template +constexpr inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr inline integer get_log_num_threads_x_inner_scan(integer num_rows, integer row_size) { + integer log_num_threads_x = 0; + integer log_num_threads_y = 0; + while (((integer)1 << log_num_threads_x) < row_size) { + ++log_num_threads_x; + } + while (((integer)1 << log_num_threads_y) < num_rows) { + ++log_num_threads_y; + } + // we want to keep the ratio between the x-threads and y-threads about the same as + // the ratio between the row_size and num_rows, but the total number of threads in + // a block should be about 512 + integer diff = log_num_threads_x - log_num_threads_y; + // 9 is from log2(512) + log_num_threads_x = ((integer)9 + diff) / (integer)2; + // I found that in having larger log_num_threads_x can give significant speed up in some cases, + // but detrimental in another case, so just keep the lower bound to be log2(16) == 4 to make it + // similar to the previous implementation + // Keeping the upper bound to be log2(512) == 9 as the maximum number of threads in a block. + log_num_threads_x = std::min(std::max((integer)4, log_num_threads_x), (integer)9); + return log_num_threads_x; +} + +template +__device__ void binary_op_update(const scalar_t lhs, scalar_t& rhs, const idx_t lhs_idx, idx_t& rhs_idx, BinaryOperation binary_op) { + if(!at::_isnan(rhs) && (at::_isnan(lhs) || !binary_op(rhs, lhs))) { + rhs = lhs; + rhs_idx = lhs_idx; + } +} +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + int num_rows, int row_size, + const uint32_t num_threads, const uint32_t log_num_threads_x, + scalar_t init, BinaryFunction binary_op) { + // dynamic memory allocation for vbuf and ibuf + alignas(sizeof(double)) extern __shared__ char buf[]; + scalar_t* vbuf = reinterpret_cast(buf); // the size is num_threads * 2 + int64_t* ibuf = reinterpret_cast(vbuf + num_threads * 2); + const uint32_t num_threads_x = 1 << log_num_threads_x; + scalar_t* row_buf = vbuf + 2 * num_threads_x * threadIdx.y; + int64_t* row_idx_buf = ibuf + 2 * num_threads_x * threadIdx.y; + + for (int block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + int row = block_row + threadIdx.y; + const scalar_t *row_self = self_ + row * row_size; + scalar_t *row_values = values_ + row * row_size; + int64_t *row_indices = indices_ + row * row_size; + scalar_t block_total = init; + int64_t block_idx_final = 0; + const bool row_exists = row < num_rows; + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (int block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + int col1 = block_col + threadIdx.x; + int col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = c10::load(&row_self[col1]); + row_idx_buf[threadIdx.x] = col1; + } else { + row_buf[threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = c10::load(&row_self[col2]); + row_idx_buf[num_threads_x + threadIdx.x] = col2; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + binary_op_update(block_total, row_buf[0], block_idx_final, row_idx_buf[0], binary_op); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t s = 1; s <= num_threads_x; s <<= 1) { + if (row_exists) { + uint32_t a = (threadIdx.x / s) * (2 * s) + s; + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + binary_op_update(row_buf[si], row_buf[ti], row_idx_buf[si], row_idx_buf[ti], binary_op); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size){ + row_values[col1] = row_buf[threadIdx.x]; + row_indices[col1] = row_idx_buf[threadIdx.x]; + } + if (col2 < row_size) { + row_values[col2] = row_buf[num_threads_x + threadIdx.x]; + row_indices[col2] = row_idx_buf[num_threads_x + threadIdx.x]; + } + } + block_total = row_buf[2 * num_threads_x - 1]; + block_idx_final = row_idx_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *self = self_ + orow * row_size * num_irows + irow; + scalar_t *values = values_ + orow * row_size * num_irows + irow; + int64_t *indices = indices_ + orow * row_size * num_irows + irow; + scalar_t out = init; + int64_t out_idx = 0; + + for (auto col = decltype(row_size){0}; col < row_size; ++col) { + const auto val = c10::load(self); + if(at::_isnan(val) || (!at::_isnan(out) && binary_op(val, out))) { + out = val; + out_idx = col; + } + *values = out; + *indices = out_idx; + self += num_irows; + values += num_irows; + indices += num_irows; + } + } + } +} + +inline void check_fits_in_unsigned(int64_t val, const char* name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK( + val >= 0 && val <= umax, name, " must fit in a 32-bit uint32_t value"); +} + + +template +__host__ void scan_outer_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + int dim, scalar_t init, BinaryFunction binary_op) { + int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. + const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row, + //make sure that input is not bigger than supported by uint32_t + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + tensor_kernel_scan_outer_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__host__ void scan_innermost_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + // Treat all outer dimensions as a single dimension. + int row_size = self.size(ndim - 1); + int num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + dim3 grid(std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[0], ceil_div(num_rows, int(threads.y)))); + + const uint32_t mem_size = 2 * num_threads * (sizeof(scalar_t) + sizeof(int64_t)); + tensor_kernel_scan_innermost_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_rows, row_size, num_threads, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) { + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous()); + if (dim == ndim - 1) { + scan_innermost_dim_with_indices(*self_, values, indices, init, binary_op); + } else { + scan_outer_dim_with_indices(*self_, values, indices, dim, init, binary_op); + } +} + +// TODO: The implementation of `tensor_kernel_scan_outer_dim` and +// `tensor_kernel_scan_innermost_dim` is similar to +// `tensor_kernel_scan_outer_dim_with_indices` +// `tensor_kernel_scan_outer_dim_with_indices` and should be refactored to +// remove the duplication. + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to scan; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, + const scalar_t init, BinaryOp binary_op) +{ + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *src = src_ + orow * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; + scalar_t acc = init; + + for (uint32_t col = 0; col < row_size; ++col) { + acc = binary_op(acc, c10::load(src)); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, const T *src_, + const uint32_t num_rows, const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, BinaryFunction binary_op){ + const uint32_t num_threads_x = 1 << log_num_threads_x; + for (uint32_t block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + uint32_t row = block_row + threadIdx.y; + T block_total = init; + + const T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + const bool row_exists = row < num_rows; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (uint32_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + uint32_t col1 = block_col + threadIdx.x; + uint32_t col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t m = 0; m <= log_num_threads_x; ++m) { + if (row_exists) { + uint32_t s = 1 << m; // s = 2 ^ m + uint32_t a = ((threadIdx.x >> m) << (m + 1)) | s; // a = (threadIdx.x / s) * (2 * s) + s + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + row_buf[ti] = binary_op(row_buf[ti], row_buf[si]); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +template < + typename T, + class BinaryFunction> +__global__ void tensor_kernel_scan_innermost_dim( + T* tgt_, + const T* src_, + const uint32_t num_rows, + const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, + BinaryFunction binary_op) { + alignas(sizeof(double)) extern __shared__ char sbuf[]; + T* sbuf2 = reinterpret_cast(sbuf); + const uint32_t num_threads_x = 1 << log_num_threads_x; + T* row_buf = reinterpret_cast(sbuf2 + num_threads_x * 2 * threadIdx.y); + + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); +} + + +template +__host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, + int dim, scalar_t init, BinaryFunction binary_op) { + const int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. + const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_innermost_dim(const TensorBase& self, const TensorBase& result, + scalar_t init, BinaryFunction binary_op) { + int64_t ndim = self.dim(); + // Treat all outer dimensions as a single dimension. + int64_t row_size = self.size(ndim - 1); + int64_t num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + dim3 grid(std::min(maxGridDim, ceil_div(num_rows, int64_t{threads.y}))); + + check_fits_in_unsigned(num_rows, "Number of rows (self.numel()/self.size(self.dim()-1))"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_innermost_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_rows, row_size, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim(const TensorBase& self, const TensorBase& result, + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(result.is_contiguous()); + + if (self.numel() == self.size(dim)) { + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } else if (dim == ndim - 1) { + scan_innermost_dim(*self_, result, init, binary_op); + } else { + scan_outer_dim(*self_, result, dim, init, binary_op); + } +} + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortStable.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortStable.h new file mode 100644 index 0000000000000000000000000000000000000000..039c4307c522c9f81bf88554483f67a26127561a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortStable.h @@ -0,0 +1,19 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +// Stable-sort self into values, and set indices to the +// inverse-permutation from values back to self. +// Output tensors must be pre-allocated and contiguous. +void launch_stable_sort_kernel( + const TensorBase& self, + int64_t dim, + bool descending, + const TensorBase& values, + const TensorBase& indices); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1aeaca19652a652db6ff3aded81e2bdec8b3a4af --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh @@ -0,0 +1,429 @@ +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +template +struct TopKTypeConfig {}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + // Converts a float to an integer representation with the same + // sorting; i.e., for floats f1, f2: + // if f1 < f2 then convert(f1) < convert(f2) + // We use this to enable radix selection of floating-point values. + // This also gives a relative order for NaNs, but that's ok, as they + // will all be adjacent + // neg inf: signbit=1 exp=ff fraction=0 --> radix = 0 00 ff.. + // pos inf: signbit=0 exp=ff fraction=0 --> radix = 1 ff 00.. + // pos nan: signbit=0 exp=ff fraction>0 --> radix = 1 ff x>0 + // neg nan: signbit=1 exp=ff fraction>0 --> radix = 0 00 x +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(uint8_t v) { + return v; + } + + static inline __device__ uint8_t deconvert(RadixType v) { + return v; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int8_t v) { + return 128u + v; + } + + static inline __device__ int8_t deconvert(RadixType v) { + return v - 128; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int16_t v) { + static_assert(sizeof(short) == 2, ""); + return 32768u + v; + } + + static inline __device__ int16_t deconvert(RadixType v) { + return v - 32768; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int32_t v) { + static_assert(sizeof(int) == 4, ""); + return 2147483648u + v; + } + + static inline __device__ int32_t deconvert(RadixType v) { + return v - 2147483648u; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(int64_t v) { + static_assert(sizeof(int64_t) == 8, ""); + return 9223372036854775808ull + v; + } + + static inline __device__ int64_t deconvert(RadixType v) { + return v - 9223372036854775808ull; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(double v) { + RadixType x = __double_as_longlong(v); + RadixType mask = -((x >> 63)) | 0x8000000000000000; + return (v == v) ? (x ^ mask) : 0xffffffffffffffff; + } + + static inline __device__ double deconvert(RadixType v) { + RadixType mask = ((v >> 63) - 1) | 0x8000000000000000; + return __longlong_as_double(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::Half v) { +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) + RadixType x = __half_as_ushort(v); + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; +#else + CUDA_KERNEL_ASSERT(false); + return 0u; +#endif + } + + static inline __device__ at::Half deconvert(RadixType v) { +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + return __ushort_as_half(v ^ mask); +#else + CUDA_KERNEL_ASSERT(false); + return static_cast(0); +#endif + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::BFloat16 v) { + RadixType x = v.x; + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; + } + + static inline __device__ at::BFloat16 deconvert(RadixType v) { + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + at::BFloat16 r; + r.x = (v ^ mask); + return r; + } +}; + +// This function counts the distribution of all input values in a +// slice we are selecting by radix digit at `radixDigitPos`, but only +// those that pass the filter `((v & desiredMask) == desired)`. +// This produces and broadcasts the seen counts for a single block only. +// `smem` must have at least `RadixSize` elements. +template < + typename scalar_t, + typename bitwise_t, + typename index_t, + typename CountType, + int RadixSize, + int RadixBits> +__device__ void countRadixUsingMask( + CountType counts[RadixSize], + CountType* smem, + bitwise_t desired, + bitwise_t desiredMask, + int radixDigitPos, + index_t sliceSize, + index_t withinSliceStride, + const scalar_t* data) { + // Clear out per-thread counts from a previous round +#pragma unroll + for (int i = 0; i < RadixSize; ++i) { + counts[i] = 0; + } + + if (threadIdx.x < RadixSize) { + smem[threadIdx.x] = 0; + } + __syncthreads(); + + // Scan over all the data. Upon a read, the warp will accumulate + // counts per each digit in the radix using warp voting. +#if !defined(USE_ROCM) + // Must be called outside of loop to ensure all threads participate + unsigned mask = WARP_BALLOT(threadIdx.x < sliceSize); +#endif + for (index_t i = threadIdx.x; i < sliceSize;) { + bitwise_t val = + TopKTypeConfig::convert(doLdg(&data[i * withinSliceStride])); + + bool hasVal = ((val & desiredMask) == desired); + bitwise_t digitInRadix = at::cuda::Bitfield::getBitfield( + val, radixDigitPos, RadixBits); + +#pragma unroll + for (uint32_t j = 0; j < RadixSize; ++j) { + bool vote = hasVal && (digitInRadix == j); +#if defined(USE_ROCM) + counts[j] += __popcll(WARP_BALLOT(vote)); +#else + counts[j] += __popc(WARP_BALLOT(vote, mask)); +#endif + } + i += blockDim.x; +#if !defined(USE_ROCM) + mask = WARP_BALLOT(i < sliceSize, mask); +#endif + } + + // Now, for each warp, sum values + if (at::cuda::getLaneId() == 0) { +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + gpuAtomicAddNoReturn(&smem[i], counts[i]); + } + } + + __syncthreads(); + + // For each thread, read in the total counts +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + counts[i] = smem[i]; + } + + __syncthreads(); +} + +// Over what radix we are selecting values +constexpr int RADIX_BITS = 2; // digits are base-(2 ^ RADIX_BITS) +constexpr int RADIX_SIZE = 4; // 2 ^ RADIX_BITS +constexpr int RADIX_MASK = (RADIX_SIZE - 1); + +// This finds the unique value `v` that matches the pattern +// ((v & desired) == desiredMask) in our sorted int format +template +__device__ scalar_t findPattern( + scalar_t* smem, + const scalar_t* data, + index_t sliceSize, + index_t withinSliceStride, + bitwise_t desired, + bitwise_t desiredMask) { + if (threadIdx.x < 2) { + smem[threadIdx.x] = static_cast(0); + } + __syncthreads(); + + // All threads participate in the loop, in order to sync on the flag + index_t numIterations = + round_up(sliceSize, static_cast(blockDim.x)); + for (index_t i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < sliceSize); + scalar_t v = inRange ? doLdg(&data[i * withinSliceStride]) + : static_cast(0); + + if (inRange && + ((TopKTypeConfig::convert(v) & desiredMask) == desired)) { + // There should not be conflicts if we are using findPattern, + // since the result is unique + smem[0] = static_cast(1); + smem[1] = v; // can't use val as the flag, since it could be 0 + } + + __syncthreads(); + + scalar_t found = smem[0]; + scalar_t val = smem[1]; + + __syncthreads(); + + // Check to see if a thread found the value + if (found != static_cast(0)) { + // all threads return this value + return val; + } + } + + // should not get here + CUDA_KERNEL_ASSERT(false); + return static_cast(0); +} + +// Returns the top-Kth element found in the data using radix selection +template +__device__ void radixSelect( + const scalar_t* data, + index_t k, + bool largest, + index_t sliceSize, + index_t withinSliceStride, + int* smem, + scalar_t* topK) { + // Per-thread buckets into which we accumulate digit counts in our + // radix + int counts[RADIX_SIZE]; + + // We only consider elements x such that (x & desiredMask) == desired + // Initially, we consider all elements of the array, so the above + // statement is true regardless of input. + bitwise_t desired = 0; + bitwise_t desiredMask = 0; + + // We are looking for the top kToFind-th element when iterating over + // digits; this count gets reduced by elimination when counting + // successive digits + int kToFind = k; + + // We start at the most significant digit in our radix, scanning + // through to the least significant digit + for (int digitPos = sizeof(scalar_t) * 8 - RADIX_BITS; digitPos >= 0; + digitPos -= RADIX_BITS) { + // Count radix distribution for the current position and reduce + // across all threads + countRadixUsingMask< + scalar_t, + bitwise_t, + index_t, + int, + RADIX_SIZE, + RADIX_BITS>( + counts, + smem, + desired, + desiredMask, + digitPos, + sliceSize, + withinSliceStride, + data); + + auto found_unique = [&](int i, int count) -> bool { + /* All threads have the same value in counts here, so all */ + /* threads will return from the function. */ + if (count == 1 && kToFind == 1) { + /* There is a unique answer. */ + desired = at::cuda::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::cuda::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The answer is now the unique element v such that: */ + /* (v & desiredMask) == desired */ + /* However, we do not yet know what the actual element is. We */ + /* need to perform a search through the data to find the */ + /* element that matches this pattern. */ + *topK = findPattern( + (scalar_t*)smem, + data, + sliceSize, + withinSliceStride, + desired, + desiredMask); + return true; + } + return false; + }; + auto found_non_unique = [&](int i, int count) -> bool { + if (count >= kToFind) { + desired = + at::cuda::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::cuda::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The top-Kth element v must now be one such that: */ + /* (v & desiredMask == desired) */ + /* but we haven't narrowed it down; we must check the next */ + /* least-significant digit */ + return true; + } + kToFind -= count; + return false; // continue the loop + }; + + // All threads participate in the comparisons below to know the + // final result + if (largest) { + // Process in descending order +#pragma unroll + for (int i = RADIX_SIZE - 1; i >= 0; --i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } else { + // Process in ascending order +#pragma unroll + for (int i = 0; i < RADIX_SIZE; ++i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } + } // end digitPos for + + // There is no unique result, but there is a non-unique result + // matching `desired` exactly + *topK = TopKTypeConfig::deconvert(desired); +} +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6e1cccc2e175cb26b1ee12690d67d1514e95f246 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh @@ -0,0 +1,16 @@ +#include + +namespace at { +namespace native { +namespace internal { + +template +std::tuple unique_cuda_template( + const Tensor& self, + const bool consecutive, + const bool return_inverse, + const bool return_counts); + +} // namespace internal +} // namespace at +} // namespace native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UpSample.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UpSample.cuh new file mode 100644 index 0000000000000000000000000000000000000000..b7f97088c5ff38e8c46edafdd8dc3c55c330ca3c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UpSample.cuh @@ -0,0 +1,370 @@ +#pragma once +#include +#include + +#include +#include +#include +#include + +#include + +namespace at { +namespace native { + +namespace upsample { +// TODO: Remove duplicate declaration. +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. + at::OptionalIntArrayRef output_size, + c10::optional> scale_factors); +} // namespace upsample + +namespace upsample_cuda { + +// TODO: Remove duplication with Upsample.h (CPU). +inline c10::optional get_scale_value(c10::optional> scales, int idx) { + if (!scales) { + return nullopt; + } + return scales->at(idx); +} + +} // namespace upsample_cuda + + +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} + +// NOTE [ Nearest neighbor upsampling kernel implementation ] +// +// The nearest neighbor upsampling kernel implementation is symmetrical as +// expected. We launch kernels with threads mapping to destination tensors where +// kernels write data to, each thread reads data from the source tensor, this +// means: +// 1. In the forward kernel, +// src_xxx refers to properties of input tensors; +// dst_xxx refers to properties of output tensors; +// scale_factor is the ratio of src_size to dst_size; +// 2. In the backward kernel, +// src_xxx refers to properties of grad_output tensors; +// dst_xxx refers to properties of grad_input tensors; +// scale_factor is the ratio of src_size to dst_size; +// +// Because of this, we need to take the reciprocal of the scale defined by +// upsample layer during forward path. The motivation is to avoid slow +// division in the kernel code, so we can use faster multiplication instead. +// This is not necessary during backward path, since the scale_factor is already +// the reciprocal of corresponding scale_factor used in the forward path due to +// the swap of source and destination tensor. +// +// Similarly, since the mapping from grad_input to grad_output during backward +// is the reverse of the mapping of output to input, we need to have opposite +// mapping functions to compute the source index. + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ static accscalar_t compute_scales_value( + const c10::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)(1.0 / scale.value()) + : (accscalar_t)src_size / dst_size; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ static accscalar_t compute_scales_value_backwards( + const c10::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)scale.value() + : (accscalar_t)src_size / dst_size; +} + +template +__host__ __forceinline__ static accscalar_t area_pixel_compute_scale( + int input_size, + int output_size, + bool align_corners, + const c10::optional scale) { + if(align_corners) { + if(output_size > 1) { + return (accscalar_t)(input_size - 1) / (output_size - 1); + } + else { + return static_cast(0); + } + } + else{ + return compute_scales_value(scale, input_size, output_size); + } +} + +template +__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index( + accscalar_t scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + accscalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // See Note[Follow Opencv resize logic] + return (!cubic && src_idx < static_cast(0)) + ? static_cast(0) + : src_idx; + } +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ static int nearest_neighbor_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index) * scale + // input_index = round(index_f32) + // Same as a buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. + // See nearest_neighbor_exact_compute_source_index as replacement + const int src_index = + min(static_cast(floorf((dst_index) * scale)), input_size - 1); + return src_index; +} + +__device__ __forceinline__ static int nearest_neighbor_exact_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(floorf((dst_index + static_cast(0.5)) * scale)), input_size - 1); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. + // See nearest_neighbor_exact_bw_compute_source_index as replacement + const int src_index = + min(static_cast(ceilf(dst_index * scale)), output_size); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(ceilf(dst_index * scale - static_cast(0.5))), output_size); + return src_index; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static scalar_t upsample_get_value_bounded( + const PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch][channel][access_y][access_x]; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ static void upsample_increment_value_bounded( + PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x, + accscalar_t value) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + /* TODO: result here is truncated to scalar_t, + check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 + */ + gpuAtomicAddNoReturn( + &data[batch][channel][access_y][access_x], static_cast(value)); +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +__device__ __forceinline__ static accscalar_t cubic_convolution1( + accscalar_t x, + accscalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +__device__ __forceinline__ static accscalar_t cubic_convolution2( + accscalar_t x, + accscalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +__device__ __forceinline__ static void get_cubic_upsampling_coefficients( + accscalar_t coeffs[4], + accscalar_t t) { + accscalar_t A = -0.75; + + accscalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + accscalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +__device__ __forceinline__ static accscalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + accscalar_t t) { + accscalar_t coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +namespace upsample_antialias { + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L20-L29 +struct BilinearFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + if (x < 0) { + x = -x; + } + if (x < 1) { + return 1 - x; + } + return 0; + } + + static const int size = 2; +}; + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L46-L62 +struct BicubicFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm + const accscalar_t a = -0.5; + if (x < 0) { + x = -x; + } + if (x < 1) { + return ((a + 2) * x - (a + 3)) * x * x + 1; + } + if (x < 2) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0; + } + + static const int size = 4; +}; + +template +__device__ __forceinline__ static void _compute_weights_span( + const int i, + const int input_size, + const accscalar_t scale, + const accscalar_t support, + int& xmin, + int& xsize, + accscalar_t& center) { + center = scale * (i + static_cast(0.5)); + xmin = max(static_cast(center - support + static_cast(0.5)), static_cast(0)); + xsize = min(static_cast(center + support + static_cast(0.5)), input_size) - xmin; +} + +template +__device__ __forceinline__ static void _compute_weights( + scalar_t* wt_ptr, + const accscalar_t scale, + int interp_size, + const interp_filter_t& interp_filter, + accscalar_t xmin_m_center, + int xsize) { + + accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; + accscalar_t total_w = 0.0; + int j = 0; + for (j = 0; j < xsize; j++) { + accscalar_t w = interp_filter((j + xmin_m_center + static_cast(0.5)) * invscale); + wt_ptr[j] = static_cast(w); + total_w += w; + } + for (j = 0; j < xsize; j++) { + if (total_w != 0.0) { + wt_ptr[j] /= total_w; + } + } + for (; j < interp_size; j++) { + wt_ptr[j] = static_cast(0.0); + } +} + +template +__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim( + const scalar_t* src, + const scalar_t* weights, + int size) { + scalar_t t = static_cast(*src); + scalar_t wts = static_cast(weights[0]); + accscalar_t output = t * wts; + + int j = 1; + for (; j < size; j++) { + wts = static_cast(weights[j]); + t = static_cast(*(src + j)); + output += t * wts; + } + return output; +} + +} + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..182195969ed9a32770876aed7ac9e060e8157e1f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh @@ -0,0 +1,202 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +namespace { + +constexpr uint8_t kParamIdx = 0; +constexpr uint8_t kGradIdx = 1; +constexpr uint8_t kExpAvgIdx = 2; +constexpr uint8_t kExpAvgSqIdx = 3; +constexpr uint8_t kMaxExpAvgSqIdx = 4; + +template < + typename scalar_type, + typename opmath_t, + int depth, + ADAM_MODE adam_mode, + bool amsgrad> +C10_DEVICE inline void adam_math( + scalar_type r_args[depth][kILP], + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr, + const opmath_t& bias_correction1, + const opmath_t& bias_correction2_sqrt) { + static_assert(depth == 4 || depth == 5); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + // Load values. + opmath_t param = static_cast(r_args[kParamIdx][ii]); + opmath_t grad = static_cast(r_args[kGradIdx][ii]); + if (grad_scale_ptr) { + grad /= (static_cast(*grad_scale_ptr)); + } + const opmath_t grad_to_store = grad; + if (maximize) { + grad = -grad; + } + opmath_t exp_avg = static_cast(r_args[kExpAvgIdx][ii]); + opmath_t exp_avg_sq = static_cast(r_args[kExpAvgSqIdx][ii]); + opmath_t max_exp_avg_sq; + if (amsgrad) { + max_exp_avg_sq = static_cast(r_args[kMaxExpAvgSqIdx][ii]); + } + // Update param, grad, 1st and 2nd order momentum. + if (weight_decay != 0) { + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad += param * weight_decay; + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param -= lr * weight_decay * param; + } + } + // todo(crcrpar): use lerp + // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/ + exp_avg = beta1 * exp_avg + (1 - beta1) * grad; + exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad; + const opmath_t step_size = lr / bias_correction1; + opmath_t denom; + if (amsgrad) { + max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq); + denom = (std::sqrt(max_exp_avg_sq) / bias_correction2_sqrt) + eps; + } else { + denom = (std::sqrt(exp_avg_sq) / bias_correction2_sqrt) + eps; + } + param -= step_size * exp_avg / denom; + + // Store results. + r_args[kParamIdx][ii] = param; + if (grad_scale_ptr) { + r_args[kGradIdx][ii] = grad_to_store; + } + r_args[kExpAvgIdx][ii] = exp_avg; + r_args[kExpAvgSqIdx][ii] = exp_avg_sq; + if (amsgrad) { + r_args[kMaxExpAvgSqIdx][ii] = max_exp_avg_sq; + } + } +} + +// [note: Conditional Gradient Store when `optimizer.step` is called by +// GradScaler] When a user is training their model(s) with an FP16 AMP recipe, +// parameter updates are done via `grad_scaler.step(optimizer)` instead of +// `optimizer.step()`. For most optimizers, GradScaler unscales gradients on +// behalf of those optimizers. Also, before `.step`, it makes sure that all the +// gradients involved are finite, which incurs a device sync. On the other hand, +// fused optimizers set their member variable of `_step_supports_amp_scaling` to +// `True` in order to remove the device sync above. This means that fused +// optimizers have to have their CUDA kernels (a) unscale gradients and (b) skip +// parameter updates accordingly. To be functionally on par with `torch.optim` +// optimizers and `_multi_tensor` ones, the kernel below writes out gradients +// only when `grad_scale_ptr != nullptr. +template +struct FusedAdamMathFunctor { + static_assert( + depth == 4 || depth == 5, + "depth of 4 for Adam, depth of 5 for Adam with AMSGrad."); + using opmath_t = at::opmath_type; + C10_DEVICE __forceinline__ void operator()( + int chunk_size, + FusedOptimizerTensorListMetadata& tl, + const float* lr_ptr, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? *lr_ptr : lr; + + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + const auto [bias_correction1, bias_correction2_sqrt] = + [&]() -> std::pair { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count); + const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count); + const auto bias_correction2_sqrt = std::sqrt(bias_correction2); + return {bias_correction1, bias_correction2_sqrt}; + }(); + + scalar_type* args[depth]; + scalar_type r_args[depth][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + + const bool all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; + if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int i = 0; i < depth; i++) { + load_store(r_args[i], args[i], 0, i_start); + } + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + load_store(args[i], r_args[i], i_start, 0); + } + } + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + store_args(args[i], r_args[i], i_start, chunk_size, n); + } + } + } + } + } +}; +} // namespace + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/jit_utils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/jit_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..575c51c96db36e48f7be2f03336cbf9beaed17fc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/jit_utils.h @@ -0,0 +1,215 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace at { namespace cuda { namespace jit { + +enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar}; + +struct NvrtcFunction { + CUmodule module = CUmodule(); + CUfunction function = nullptr; +}; + +struct KernelDescriptor { + std::string name; + std::string f; + c10::ScalarType f_inputs_type; + c10::ScalarType result_type; + c10::SmallVector extra_args_types; + int nInputs, nOutputs; +}; + +// Helper function to return a vector +// corresponding to the type of the arguments in parameter pack. +template +c10::SmallVector get_extra_args_types() { + return {c10::CppTypeToScalarType::value ...}; +} + +template < + typename result_type, + typename f_inputs_type, + typename... ExtraArgs> +KernelDescriptor make_kernel_descriptor( + std::string name, + std::string f, + int nInputs, + int nOutputs) { + KernelDescriptor ret; + ret.name = std::move(name); + ret.f = std::move(f); + ret.f_inputs_type = c10::CppTypeToScalarType::value; + ret.result_type = c10::CppTypeToScalarType::value; + ret.extra_args_types = get_extra_args_types(); + ret.nInputs = nInputs; + ret.nOutputs = nOutputs; + return ret; +} + +inline int can_vectorize_up_to(size_t default_alignment, void *pointer) { + auto ip = reinterpret_cast(pointer); + if (ip % (4 * default_alignment) == 0) { + return 4; + } + if (ip % (2 * default_alignment) == 0) { + return 2; + } + return 1; +} + +inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef pointers) { + TORCH_INTERNAL_ASSERT(desc.nOutputs == 1); + TORCH_INTERNAL_ASSERT(static_cast(pointers.size()) == 1 + desc.nInputs); + + // Deals with output + auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize(); + int result = can_vectorize_up_to(result_size, pointers[0]); + + // Incorporates input(s) + auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + for (auto i : c10::irange(1, pointers.size())) { + result = std::min(result, can_vectorize_up_to(input_size, pointers[i])); + } + + return result; +} + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func, + const std::string& name, + const std::string& f_input_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +std::string generate_reduction_code( + const KernelDescriptor &desc, + const int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +NvrtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name); + +void launch_jitted_pwise_function( + NvrtcFunction function, + void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); + +template +struct delayed_false : std::false_type { +}; + +// Defines type names +// NOTE: General case is instantiated only for invalid types. +// All the valid types have specialization using the TYPE_NAME_FN +// macro below. +template +inline std::string typeName() { + // we can't use static_assert(false) directly as the + // program will be not compiled even if the template is not + // instantiated, so we use `delayed_false` + // to make sure compiler doesn't eagerly raise + // fail this assertion. + static_assert(delayed_false::value, "invalid type for jiterator"); + return "void"; +} + +#define TYPE_NAME_FN(ctype, name) \ +template <> inline std::string typeName(){ \ + return std::string(#ctype); \ +} + +AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) +#undef TYPE_NAME_FN +// JIT uses std::complex directly, because nvRTC compile programs +// with -default-device, so there is no such issue like: +// "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName(){ + return "at::Half"; +} +template <> inline std::string typeName(){ + return "at::BFloat16"; +} +template <> inline std::string typeName(){ + return "at::Float8_e5m2"; +} +template <> inline std::string typeName(){ + return "at::Float8_e4m3fn"; +} +template <> inline std::string typeName() { + return "at::Float8_e5m2fnuz"; +} +template <> inline std::string typeName() { + return "at::Float8_e4m3fnuz"; +} + +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return typeName(); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE) + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + +TORCH_CUDA_CPP_API void initializeCudaContext(); + +}}} // namespace at::cuda::jit diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/vol2col.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/vol2col.cuh new file mode 100644 index 0000000000000000000000000000000000000000..51dbe1c7440533a1e6376c84fe245f6a89aafb1c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/vol2col.cuh @@ -0,0 +1,263 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace at { +namespace native { + +using namespace at::cuda::detail; + +// Kernel for fast unfold+copy on volumes +template +__global__ void vol2col_kernel( + const int64_t n, + const T* data_vol, + const int depth, + const int height, + const int width, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + const int depth_col, + const int height_col, + const int width_col, + T* data_col) { + CUDA_KERNEL_LOOP(index, n) { + auto w_out = index % width_col; + index /= width_col; + auto h_out = index % height_col; + index /= height_col; + auto t_out = index % depth_col; + auto channel_in = index / depth_col; + auto channel_out = channel_in * ksize_t * ksize_h * ksize_w; + auto t_in = t_out * stride_t - pad_t; + auto h_in = h_out * stride_h - pad_h; + auto w_in = w_out * stride_w - pad_w; + data_col += + ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + auto t = t_in + i * dilation_t; + auto h = h_in + j * dilation_h; + auto w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && + w < width) + ? data_vol + [i * dilation_t * height * width + j * dilation_h * width + + k * dilation_w] + : static_cast(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col( + cudaStream_t stream, + const T* data_vol, + const int channels, + const int depth, + const int height, + const int width, + const int depth_col, + const int height_col, + const int width_col, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + T* data_col) { + // We are going to launch channels * depth_col * height_col * width_col + // kernels, each kernel responsible for copying a single-channel grid. + // We cast an operand to int64 so that the product will not overflow + const auto num_kernels = static_cast(channels) * depth_col * height_col * width_col; + // Launch + vol2col_kernel<<>>( + num_kernels, + data_vol, + depth, + height, + width, + ksize_t, + ksize_h, + ksize_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + depth_col, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__global__ void vol2im_kernel( + const int64_t n, + const T* data_col, + const unsigned depth, + const unsigned height, + const unsigned width, + const unsigned channels, + const unsigned kernel_t, + const unsigned kernel_h, + const unsigned kernel_w, + const unsigned pad_t, + const unsigned pad_h, + const unsigned pad_w, + const unsigned stride_t, + const unsigned stride_h, + const unsigned stride_w, + const unsigned dilation_t, + const unsigned dilation_h, + const unsigned dilation_w, + const unsigned depth_col, + const unsigned height_col, + const unsigned width_col, + T* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + accT val = static_cast(0); + const auto w_im = index % width + pad_w; + const auto h_im = (index / width) % height + pad_h; + const auto t_im = (index / width / height) % depth + pad_t; + const auto c_im = index / (width * height * depth); + auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const auto w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const auto w_col_end = std::min(w_im / stride_w + 1, width_col); + const auto h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const auto h_col_end = std::min(h_im / stride_h + 1, height_col); + const auto t_col_start = + (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1; + const auto t_col_end = std::min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) { + uint64_t t_k = (t_im - t_col * stride_t); + uint64_t h_k = (h_im - h_col * stride_h); + uint64_t w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && + w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + const int64_t idx_k = + ((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k; + const int64_t data_col_index = + ((idx_k * depth_col + t_col) * + height_col + h_col) * + width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = static_cast(val); + } +} + +template +void col2vol( + cudaStream_t stream, + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t output_depth, + const int64_t output_height, + const int64_t output_width, + const int64_t patch_t, + const int64_t patch_h, + const int64_t patch_w, + const int64_t pad_t, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_t, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_t, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_vol) { + const auto num_kernels = channels * depth * height * width; + + auto check_fits_in_unsigned = + [](int64_t val, const char * name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK(val >= 0 && val <= umax, + name, " must fit in a 32-bit unsigned value"); + }; + check_fits_in_unsigned(num_kernels, "input size"); + check_fits_in_unsigned( + channels * patch_t * patch_h * patch_w, "channels x kernel size"); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + vol2im_kernel + <<>>( + num_kernels, + data_col, + depth, + height, + width, + channels, + patch_t, + patch_h, + patch_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + output_depth, + output_height, + output_width, + data_vol); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..df94723ab2a216dcc2f98a6a373b46cb239a42b3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +template +static void im2col( + const T* data_im, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_col, + bool is_channels_last = false) { + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + at::parallel_for(0, height_col * width_col, 0, [&](int64_t begin, int64_t end) { + int64_t h_col{0}, w_col{0}; + data_index_init(begin, h_col, height_col, w_col, width_col); + + for (const auto i_col : c10::irange(begin, end)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + const T* slice_im = data_im + (h_im * width + w_im) * channels; + T* slice_col = data_col + (i_col * kernel_h * kernel_w + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::copy_n(slice_im, channels, slice_col); + } else { + std::fill_n(slice_col, channels, T(0)); + } + } + } + + // move the next index + data_index_step(h_col, height_col, w_col, width_col); + } + }); + } else { + at::parallel_for(0, channels_col, 0, [&](int64_t begin, int64_t end) { + int64_t c_im{0}, h_offset{0}, w_offset{0}; + data_index_init(begin, c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + + for (const auto c_col : c10::irange(begin, end)) { + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im[(c_im * height + h_im) * width + w_im] + : static_cast(0); + } + } + + // move to the next index + data_index_step(c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + } + }); + } +} + +template +static void col2im( + const T* data_col, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_im, + bool is_channels_last = false) { + std::fill_n(data_im, height * width * channels, T(0)); + + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + for (const auto h_col : c10::irange(height_col)) { + for (const auto w_col : c10::irange(width_col)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + T* slice_im = data_im + (h_im * width + w_im) * channels; + const T* slice_col = data_col + ((h_col * width_col + w_col) * kernel_h * kernel_w + + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) { + std::transform(slice_col, slice_col + channels, slice_im, slice_im, std::plus()); + } + } + } + } + } + } else { + for (const auto c_col : c10::irange(channels_col)) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/layer_norm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/layer_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..13fb1e4783d20bbbc68cfebc9cf609ba3274bb25 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/layer_norm.h @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +namespace { + +C10_ALWAYS_INLINE std::pair _check_layer_norm_inputs( + const Tensor& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim) + .equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + AT_ERROR(ss.str()); + } + + const int axis = input_ndim - normalized_ndim; + const int64_t M = + c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis); + const int64_t N = + c10::multiply_integers(input_shape.cbegin() + axis, input_shape.cend()); + + return std::make_pair(M, N); +} + +} // namespace + +void layer_norm_cpu_out( + at::Tensor& out, + const at::Tensor& input, + const Tensor& gamma, + const Tensor& beta, + double eps, + int64_t M, + int64_t N); + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */, + Tensor* /* mean */, + Tensor* /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* M */, + int64_t /* N */, + Tensor* /* dX */, + Tensor* /* dgamma */, + Tensor* /* dbeta */); + +DECLARE_DISPATCH(forward_fn, LayerNormKernel); +DECLARE_DISPATCH(backward_fn, LayerNormBackwardKernel); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/Copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..4ffa73d039adf20566af0d0c9c45d899bbc36fab --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/Copy.h @@ -0,0 +1,15 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at { +namespace native { +namespace mps { + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace mps +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h new file mode 100644 index 0000000000000000000000000000000000000000..56b0f7d9a03f7254dc864e385f1f556341c56108 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +#if !defined(__MAC_14_0) && \ + (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) + +typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) +{ + MPSGraphFFTScalingModeNone = 0L, + MPSGraphFFTScalingModeSize = 1L, + MPSGraphFFTScalingModeUnitary = 2L, +}; + +@interface FakeMPSGraphFFTDescriptor : NSObject +@property (readwrite, nonatomic) BOOL inverse; +@property (readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode; +@property (readwrite, nonatomic) BOOL roundToOddHermitean; ++(nullable instancetype) descriptor; +@end + +@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor; + +@interface MPSGraph (SonomaOps) +-(MPSGraphTensor * _Nonnull) conjugateWithTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) fastFourierTransformWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realToHermiteanFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) HermiteanToRealFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; +@end + +// define BFloat16 enums for MacOS13 +#define MPSDataTypeBFloat16 ((MPSDataType) (MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16)) + +// define Metal version +#define MTLLanguageVersion3_1 ((MTLLanguageVersion) ((3 << 16) + 1)) +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/UnaryConstants.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/UnaryConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..4adf1d0e333e3aa416efb33fea8368dab009a689 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/UnaryConstants.h @@ -0,0 +1,43 @@ +#pragma once + +const char* UNARY_KERNEL_TEMPLATE = R"METAL( +#include +using namespace metal; + +constant float a[4] = {{0.886226899, -1.645349621, 0.914624893, -0.140543331}}; +constant float b[4] = {{-2.118377725, 1.442710462, -0.329097515, 0.012229801}}; +constant float c[4] = {{-1.970840454, -1.624906493, 3.429567803, 1.641345311}}; +constant float d[2] = {{3.543889200, 1.637067800}}; + +kernel void erfinv_mps_kernel( device {0} *output [[buffer(0)]], + device {1} *input [[buffer(1)]], + uint index [[thread_position_in_grid]]) {{ + + float y = input[index]; + float x, z, num, dem; /*working variables */ + /* coefficients in rational expansion */ + + float y_abs = abs(y); + if(y_abs > 1.0f){{ + output[index] = NAN; + return; + }} + if(y_abs == 1.0f){{ + output[index] = copysign(INFINITY, y); + return; + }} + if(y_abs <= 0.7f) {{ + z = y * y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0]) * z + 1.0f); + x = y * num / dem; + }} + else{{ + z = sqrt(-1.0f*log((1.0-y_abs)/2.0)); + num = ((c[3]*z + c[2])*z + c[1]) * z + c[0]; + dem = (d[1]*z + d[0])*z + 1.0f; + x = copysign(num, y) / dem; + }} + + output[index] = x; +}})METAL"; \ No newline at end of file diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/fake_quantize.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/fake_quantize.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..859dfe68659cdbda3adf154744e1aa18ad413812 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/fake_quantize.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/observer.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/observer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c4e312e08a79a8b6ab21e7f400b501e929f8344 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/observer.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/quantize.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/quantize.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44e336745606d9f921ece96c30af0676539c9103 Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/quantize.cpython-311.pyc differ diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/utils.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc4c70d2a462f155178de839c8c651811f8b09ff Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/quantization/__pycache__/utils.cpython-311.pyc differ